ctdb/server/ctdb_recoverd.c

   1 /*
   2    ctdb recovery daemon
   3
   4    Copyright (C) Ronnie Sahlberg  2007
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20 #include "includes.h"
  21 #include "system/filesys.h"
  22 #include "system/time.h"
  23 #include "system/network.h"
  24 #include "system/wait.h"
  25 #include "popt.h"
  26 #include "cmdline.h"
  27 #include "../include/ctdb_client.h"
  28 #include "../include/ctdb_private.h"
  29 #include "lib/tdb_wrap/tdb_wrap.h"
  30 #include "lib/util/dlinklist.h"
  31
  32
  33 /* List of SRVID requests that need to be processed */
  34 struct srvid_list {
  35         struct srvid_list *next, *prev;
  36         struct srvid_request *request;
  37 };
  38
  39 struct srvid_requests {
  40         struct srvid_list *requests;
  41 };
  42
  43 static void srvid_request_reply(struct ctdb_context *ctdb,
  44                                 struct srvid_request *request,
  45                                 TDB_DATA result)
  46 {
  47         /* Someone that sent srvid==0 does not want a reply */
  48         if (request->srvid == 0) {
  49                 talloc_free(request);
  50                 return;
  51         }
  52
  53         if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
  54                                      result) == 0) {
  55                 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
  56                                   (unsigned)request->pnn,
  57                                   (unsigned long long)request->srvid));
  58         } else {
  59                 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
  60                                  (unsigned)request->pnn,
  61                                  (unsigned long long)request->srvid));
  62         }
  63
  64         talloc_free(request);
  65 }
  66
  67 static void srvid_requests_reply(struct ctdb_context *ctdb,
  68                                  struct srvid_requests **requests,
  69                                  TDB_DATA result)
  70 {
  71         struct srvid_list *r;
  72
  73         for (r = (*requests)->requests; r != NULL; r = r->next) {
  74                 srvid_request_reply(ctdb, r->request, result);
  75         }
  76
  77         /* Free the list structure... */
  78         TALLOC_FREE(*requests);
  79 }
  80
  81 static void srvid_request_add(struct ctdb_context *ctdb,
  82                               struct srvid_requests **requests,
  83                               struct srvid_request *request)
  84 {
  85         struct srvid_list *t;
  86         int32_t ret;
  87         TDB_DATA result;
  88
  89         if (*requests == NULL) {
  90                 *requests = talloc_zero(ctdb, struct srvid_requests);
  91                 if (*requests == NULL) {
  92                         goto nomem;
  93                 }
  94         }
  95
  96         t = talloc_zero(*requests, struct srvid_list);
  97         if (t == NULL) {
  98                 /* If *requests was just allocated above then free it */
  99                 if ((*requests)->requests == NULL) {
 100                         TALLOC_FREE(*requests);
 101                 }
 102                 goto nomem;
 103         }
 104
 105         t->request = (struct srvid_request *)talloc_steal(t, request);
 106         DLIST_ADD((*requests)->requests, t);
 107
 108         return;
 109
 110 nomem:
 111         /* Failed to add the request to the list.  Send a fail. */
 112         DEBUG(DEBUG_ERR, (__location__
 113                           " Out of memory, failed to queue SRVID request\n"));
 114         ret = -ENOMEM;
 115         result.dsize = sizeof(ret);
 116         result.dptr = (uint8_t *)&ret;
 117         srvid_request_reply(ctdb, request, result);
 118 }
 119
 120 struct ctdb_banning_state {
 121         uint32_t count;
 122         struct timeval last_reported_time;
 123 };
 124
 125 /*
 126   private state of recovery daemon
 127  */
 128 struct ctdb_recoverd {
 129         struct ctdb_context *ctdb;
 130         uint32_t recmaster;
 131         uint32_t num_active;
 132         uint32_t num_lmasters;
 133         uint32_t num_connected;
 134         uint32_t last_culprit_node;
 135         struct ctdb_node_map *nodemap;
 136         struct timeval priority_time;
 137         bool need_takeover_run;
 138         bool need_recovery;
 139         uint32_t node_flags;
 140         struct timed_event *send_election_te;
 141         struct timed_event *election_timeout;
 142         struct vacuum_info *vacuum_info;
 143         struct srvid_requests *reallocate_requests;
 144         bool takeover_run_in_progress;
 145         TALLOC_CTX *takeover_runs_disable_ctx;
 146         struct ctdb_control_get_ifaces *ifaces;
 147         uint32_t *force_rebalance_nodes;
 148 };
 149
 150 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
 151 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
 152
 153 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
 154
 155 /*
 156   ban a node for a period of time
 157  */
 158 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
 159 {
 160         int ret;
 161         struct ctdb_context *ctdb = rec->ctdb;
 162         struct ctdb_ban_time bantime;
 163
 164         if (!ctdb_validate_pnn(ctdb, pnn)) {
 165                 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
 166                 return;
 167         }
 168
 169         DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
 170
 171         bantime.pnn  = pnn;
 172         bantime.time = ban_time;
 173
 174         ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
 175         if (ret != 0) {
 176                 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
 177                 return;
 178         }
 179
 180 }
 181
 182 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
 183
 184
 185 /*
 186   remember the trouble maker
 187  */
 188 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
 189 {
 190         struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
 191         struct ctdb_banning_state *ban_state;
 192
 193         if (culprit > ctdb->num_nodes) {
 194                 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
 195                 return;
 196         }
 197
 198         /* If we are banned or stopped, do not set other nodes as culprits */
 199         if (rec->node_flags & NODE_FLAGS_INACTIVE) {
 200                 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
 201                 return;
 202         }
 203
 204         if (ctdb->nodes[culprit]->ban_state == NULL) {
 205                 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
 206                 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
 207
 208
 209         }
 210         ban_state = ctdb->nodes[culprit]->ban_state;
 211         if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
 212                 /* this was the first time in a long while this node
 213                    misbehaved so we will forgive any old transgressions.
 214                 */
 215                 ban_state->count = 0;
 216         }
 217
 218         ban_state->count += count;
 219         ban_state->last_reported_time = timeval_current();
 220         rec->last_culprit_node = culprit;
 221 }
 222
 223 /*
 224   remember the trouble maker
 225  */
 226 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
 227 {
 228         ctdb_set_culprit_count(rec, culprit, 1);
 229 }
 230
 231
 232 /* this callback is called for every node that failed to execute the
 233    recovered event
 234 */
 235 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 236 {
 237         struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
 238
 239         DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
 240
 241         ctdb_set_culprit(rec, node_pnn);
 242 }
 243
 244 /*
 245   run the "recovered" eventscript on all nodes
 246  */
 247 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
 248 {
 249         TALLOC_CTX *tmp_ctx;
 250         uint32_t *nodes;
 251         struct ctdb_context *ctdb = rec->ctdb;
 252
 253         tmp_ctx = talloc_new(ctdb);
 254         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 255
 256         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 257         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
 258                                         nodes, 0,
 259                                         CONTROL_TIMEOUT(), false, tdb_null,
 260                                         NULL, recovered_fail_callback,
 261                                         rec) != 0) {
 262                 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
 263
 264                 talloc_free(tmp_ctx);
 265                 return -1;
 266         }
 267
 268         talloc_free(tmp_ctx);
 269         return 0;
 270 }
 271
 272 /* this callback is called for every node that failed to execute the
 273    start recovery event
 274 */
 275 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 276 {
 277         struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
 278
 279         DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
 280
 281         ctdb_set_culprit(rec, node_pnn);
 282 }
 283
 284 /*
 285   run the "startrecovery" eventscript on all nodes
 286  */
 287 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
 288 {
 289         TALLOC_CTX *tmp_ctx;
 290         uint32_t *nodes;
 291         struct ctdb_context *ctdb = rec->ctdb;
 292
 293         tmp_ctx = talloc_new(ctdb);
 294         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 295
 296         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 297         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
 298                                         nodes, 0,
 299                                         CONTROL_TIMEOUT(), false, tdb_null,
 300                                         NULL,
 301                                         startrecovery_fail_callback,
 302                                         rec) != 0) {
 303                 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
 304                 talloc_free(tmp_ctx);
 305                 return -1;
 306         }
 307
 308         talloc_free(tmp_ctx);
 309         return 0;
 310 }
 311
 312 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 313 {
 314         if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
 315                 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n",  (unsigned)outdata.dsize, outdata.dptr));
 316                 return;
 317         }
 318         if (node_pnn < ctdb->num_nodes) {
 319                 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
 320         }
 321
 322         if (node_pnn == ctdb->pnn) {
 323                 ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
 324         }
 325 }
 326
 327 /*
 328   update the node capabilities for all connected nodes
 329  */
 330 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
 331 {
 332         uint32_t *nodes;
 333         TALLOC_CTX *tmp_ctx;
 334
 335         tmp_ctx = talloc_new(ctdb);
 336         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 337
 338         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
 339         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
 340                                         nodes, 0,
 341                                         CONTROL_TIMEOUT(),
 342                                         false, tdb_null,
 343                                         async_getcap_callback, NULL,
 344                                         NULL) != 0) {
 345                 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
 346                 talloc_free(tmp_ctx);
 347                 return -1;
 348         }
 349
 350         talloc_free(tmp_ctx);
 351         return 0;
 352 }
 353
 354 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 355 {
 356         struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
 357
 358         DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
 359         ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
 360 }
 361
 362 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 363 {
 364         struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
 365
 366         DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
 367         ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
 368 }
 369
 370 /*
 371   change recovery mode on all nodes
 372  */
 373 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
 374 {
 375         TDB_DATA data;
 376         uint32_t *nodes;
 377         TALLOC_CTX *tmp_ctx;
 378
 379         tmp_ctx = talloc_new(ctdb);
 380         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 381
 382         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 383
 384         data.dsize = sizeof(uint32_t);
 385         data.dptr = (unsigned char *)&rec_mode;
 386
 387         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
 388                                         nodes, 0,
 389                                         CONTROL_TIMEOUT(),
 390                                         false, data,
 391                                         NULL, NULL,
 392                                         NULL) != 0) {
 393                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
 394                 talloc_free(tmp_ctx);
 395                 return -1;
 396         }
 397
 398         /* freeze all nodes */
 399         if (rec_mode == CTDB_RECOVERY_ACTIVE) {
 400                 int i;
 401
 402                 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
 403                         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
 404                                                 nodes, i,
 405                                                 CONTROL_TIMEOUT(),
 406                                                 false, tdb_null,
 407                                                 NULL,
 408                                                 set_recmode_fail_callback,
 409                                                 rec) != 0) {
 410                                 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
 411                                 talloc_free(tmp_ctx);
 412                                 return -1;
 413                         }
 414                 }
 415         }
 416
 417         talloc_free(tmp_ctx);
 418         return 0;
 419 }
 420
 421 /*
 422   change recovery master on all node
 423  */
 424 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
 425 {
 426         TDB_DATA data;
 427         TALLOC_CTX *tmp_ctx;
 428         uint32_t *nodes;
 429
 430         tmp_ctx = talloc_new(ctdb);
 431         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 432
 433         data.dsize = sizeof(uint32_t);
 434         data.dptr = (unsigned char *)&pnn;
 435
 436         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 437         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
 438                                         nodes, 0,
 439                                         CONTROL_TIMEOUT(), false, data,
 440                                         NULL, NULL,
 441                                         NULL) != 0) {
 442                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
 443                 talloc_free(tmp_ctx);
 444                 return -1;
 445         }
 446
 447         talloc_free(tmp_ctx);
 448         return 0;
 449 }
 450
 451 /* update all remote nodes to use the same db priority that we have
 452    this can fail if the remove node has not yet been upgraded to
 453    support this function, so we always return success and never fail
 454    a recovery if this call fails.
 455 */
 456 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
 457         struct ctdb_node_map *nodemap,
 458         uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
 459 {
 460         int db;
 461
 462         /* step through all local databases */
 463         for (db=0; db<dbmap->num;db++) {
 464                 struct ctdb_db_priority db_prio;
 465                 int ret;
 466
 467                 db_prio.db_id     = dbmap->dbs[db].dbid;
 468                 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
 469                 if (ret != 0) {
 470                         DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
 471                         continue;
 472                 }
 473
 474                 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
 475
 476                 ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
 477                                                 CTDB_CURRENT_NODE, &db_prio);
 478                 if (ret != 0) {
 479                         DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
 480                                          db_prio.db_id));
 481                 }
 482         }
 483
 484         return 0;
 485 }
 486
 487 /*
 488   ensure all other nodes have attached to any databases that we have
 489  */
 490 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
 491                                            uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
 492 {
 493         int i, j, db, ret;
 494         struct ctdb_dbid_map *remote_dbmap;
 495
 496         /* verify that all other nodes have all our databases */
 497         for (j=0; j<nodemap->num; j++) {
 498                 /* we dont need to ourself ourselves */
 499                 if (nodemap->nodes[j].pnn == pnn) {
 500                         continue;
 501                 }
 502                 /* dont check nodes that are unavailable */
 503                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 504                         continue;
 505                 }
 506
 507                 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 508                                          mem_ctx, &remote_dbmap);
 509                 if (ret != 0) {
 510                         DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
 511                         return -1;
 512                 }
 513
 514                 /* step through all local databases */
 515                 for (db=0; db<dbmap->num;db++) {
 516                         const char *name;
 517
 518
 519                         for (i=0;i<remote_dbmap->num;i++) {
 520                                 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
 521                                         break;
 522                                 }
 523                         }
 524                         /* the remote node already have this database */
 525                         if (i!=remote_dbmap->num) {
 526                                 continue;
 527                         }
 528                         /* ok so we need to create this database */
 529                         ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
 530                                                   dbmap->dbs[db].dbid, mem_ctx,
 531                                                   &name);
 532                         if (ret != 0) {
 533                                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
 534                                 return -1;
 535                         }
 536                         ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
 537                                                  nodemap->nodes[j].pnn,
 538                                                  mem_ctx, name,
 539                                                  dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
 540                         if (ret != 0) {
 541                                 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
 542                                 return -1;
 543                         }
 544                 }
 545         }
 546
 547         return 0;
 548 }
 549
 550
 551 /*
 552   ensure we are attached to any databases that anyone else is attached to
 553  */
 554 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
 555                                           uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
 556 {
 557         int i, j, db, ret;
 558         struct ctdb_dbid_map *remote_dbmap;
 559
 560         /* verify that we have all database any other node has */
 561         for (j=0; j<nodemap->num; j++) {
 562                 /* we dont need to ourself ourselves */
 563                 if (nodemap->nodes[j].pnn == pnn) {
 564                         continue;
 565                 }
 566                 /* dont check nodes that are unavailable */
 567                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 568                         continue;
 569                 }
 570
 571                 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 572                                          mem_ctx, &remote_dbmap);
 573                 if (ret != 0) {
 574                         DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
 575                         return -1;
 576                 }
 577
 578                 /* step through all databases on the remote node */
 579                 for (db=0; db<remote_dbmap->num;db++) {
 580                         const char *name;
 581
 582                         for (i=0;i<(*dbmap)->num;i++) {
 583                                 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
 584                                         break;
 585                                 }
 586                         }
 587                         /* we already have this db locally */
 588                         if (i!=(*dbmap)->num) {
 589                                 continue;
 590                         }
 591                         /* ok so we need to create this database and
 592                            rebuild dbmap
 593                          */
 594                         ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 595                                             remote_dbmap->dbs[db].dbid, mem_ctx, &name);
 596                         if (ret != 0) {
 597                                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
 598                                           nodemap->nodes[j].pnn));
 599                                 return -1;
 600                         }
 601                         ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
 602                                            remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
 603                         if (ret != 0) {
 604                                 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
 605                                 return -1;
 606                         }
 607                         ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
 608                         if (ret != 0) {
 609                                 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
 610                                 return -1;
 611                         }
 612                 }
 613         }
 614
 615         return 0;
 616 }
 617
 618
 619 /*
 620   pull the remote database contents from one node into the recdb
 621  */
 622 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
 623                                     struct tdb_wrap *recdb, uint32_t dbid)
 624 {
 625         int ret;
 626         TDB_DATA outdata;
 627         struct ctdb_marshall_buffer *reply;
 628         struct ctdb_rec_data *rec;
 629         int i;
 630         TALLOC_CTX *tmp_ctx = talloc_new(recdb);
 631
 632         ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
 633                                CONTROL_TIMEOUT(), &outdata);
 634         if (ret != 0) {
 635                 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
 636                 talloc_free(tmp_ctx);
 637                 return -1;
 638         }
 639
 640         reply = (struct ctdb_marshall_buffer *)outdata.dptr;
 641
 642         if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
 643                 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
 644                 talloc_free(tmp_ctx);
 645                 return -1;
 646         }
 647
 648         rec = (struct ctdb_rec_data *)&reply->data[0];
 649
 650         for (i=0;
 651              i<reply->count;
 652              rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
 653                 TDB_DATA key, data;
 654                 struct ctdb_ltdb_header *hdr;
 655                 TDB_DATA existing;
 656
 657                 key.dptr = &rec->data[0];
 658                 key.dsize = rec->keylen;
 659                 data.dptr = &rec->data[key.dsize];
 660                 data.dsize = rec->datalen;
 661
 662                 hdr = (struct ctdb_ltdb_header *)data.dptr;
 663
 664                 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
 665                         DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
 666                         talloc_free(tmp_ctx);
 667                         return -1;
 668                 }
 669
 670                 /* fetch the existing record, if any */
 671                 existing = tdb_fetch(recdb->tdb, key);
 672
 673                 if (existing.dptr != NULL) {
 674                         struct ctdb_ltdb_header header;
 675                         if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
 676                                 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
 677                                          (unsigned)existing.dsize, srcnode));
 678                                 free(existing.dptr);
 679                                 talloc_free(tmp_ctx);
 680                                 return -1;
 681                         }
 682                         header = *(struct ctdb_ltdb_header *)existing.dptr;
 683                         free(existing.dptr);
 684                         if (!(header.rsn < hdr->rsn ||
 685                               (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
 686                                 continue;
 687                         }
 688                 }
 689
 690                 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
 691                         DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
 692                         talloc_free(tmp_ctx);
 693                         return -1;
 694                 }
 695         }
 696
 697         talloc_free(tmp_ctx);
 698
 699         return 0;
 700 }
 701
 702
 703 struct pull_seqnum_cbdata {
 704         int failed;
 705         uint32_t pnn;
 706         uint64_t seqnum;
 707 };
 708
 709 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 710 {
 711         struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
 712         uint64_t seqnum;
 713
 714         if (cb_data->failed != 0) {
 715                 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
 716                 return;
 717         }
 718
 719         if (res != 0) {
 720                 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
 721                 cb_data->failed = 1;
 722                 return;
 723         }
 724
 725         if (outdata.dsize != sizeof(uint64_t)) {
 726                 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
 727                 cb_data->failed = -1;
 728                 return;
 729         }
 730
 731         seqnum = *((uint64_t *)outdata.dptr);
 732
 733         if (seqnum > cb_data->seqnum ||
 734             (cb_data->pnn == -1 && seqnum == 0)) {
 735                 cb_data->seqnum = seqnum;
 736                 cb_data->pnn = node_pnn;
 737         }
 738 }
 739
 740 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 741 {
 742         struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
 743
 744         DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
 745         cb_data->failed = 1;
 746 }
 747
 748 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
 749                                 struct ctdb_recoverd *rec,
 750                                 struct ctdb_node_map *nodemap,
 751                                 struct tdb_wrap *recdb, uint32_t dbid)
 752 {
 753         TALLOC_CTX *tmp_ctx = talloc_new(NULL);
 754         uint32_t *nodes;
 755         TDB_DATA data;
 756         uint32_t outdata[2];
 757         struct pull_seqnum_cbdata *cb_data;
 758
 759         DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
 760
 761         outdata[0] = dbid;
 762         outdata[1] = 0;
 763
 764         data.dsize = sizeof(outdata);
 765         data.dptr  = (uint8_t *)&outdata[0];
 766
 767         cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
 768         if (cb_data == NULL) {
 769                 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
 770                 talloc_free(tmp_ctx);
 771                 return -1;
 772         }
 773
 774         cb_data->failed = 0;
 775         cb_data->pnn    = -1;
 776         cb_data->seqnum = 0;
 777
 778         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 779         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
 780                                         nodes, 0,
 781                                         CONTROL_TIMEOUT(), false, data,
 782                                         pull_seqnum_cb,
 783                                         pull_seqnum_fail_cb,
 784                                         cb_data) != 0) {
 785                 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
 786
 787                 talloc_free(tmp_ctx);
 788                 return -1;
 789         }
 790
 791         if (cb_data->failed != 0) {
 792                 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
 793                 talloc_free(tmp_ctx);
 794                 return -1;
 795         }
 796
 797         if (cb_data->pnn == -1) {
 798                 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
 799                 talloc_free(tmp_ctx);
 800                 return -1;
 801         }
 802
 803         DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
 804
 805         if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
 806                 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
 807                 talloc_free(tmp_ctx);
 808                 return -1;
 809         }
 810
 811         talloc_free(tmp_ctx);
 812         return 0;
 813 }
 814
 815
 816 /*
 817   pull all the remote database contents into the recdb
 818  */
 819 static int pull_remote_database(struct ctdb_context *ctdb,
 820                                 struct ctdb_recoverd *rec,
 821                                 struct ctdb_node_map *nodemap,
 822                                 struct tdb_wrap *recdb, uint32_t dbid,
 823                                 bool persistent)
 824 {
 825         int j;
 826
 827         if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
 828                 int ret;
 829                 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
 830                 if (ret == 0) {
 831                         return 0;
 832                 }
 833         }
 834
 835         /* pull all records from all other nodes across onto this node
 836            (this merges based on rsn)
 837         */
 838         for (j=0; j<nodemap->num; j++) {
 839                 /* dont merge from nodes that are unavailable */
 840                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 841                         continue;
 842                 }
 843                 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
 844                         DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
 845                                  nodemap->nodes[j].pnn));
 846                         ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
 847                         return -1;
 848                 }
 849         }
 850
 851         return 0;
 852 }
 853
 854
 855 /*
 856   update flags on all active nodes
 857  */
 858 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
 859 {
 860         int ret;
 861
 862         ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
 863                 if (ret != 0) {
 864                 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
 865                 return -1;
 866         }
 867
 868         return 0;
 869 }
 870
 871 /*
 872   ensure all nodes have the same vnnmap we do
 873  */
 874 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
 875                                       uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
 876 {
 877         int j, ret;
 878
 879         /* push the new vnn map out to all the nodes */
 880         for (j=0; j<nodemap->num; j++) {
 881                 /* dont push to nodes that are unavailable */
 882                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 883                         continue;
 884                 }
 885
 886                 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
 887                 if (ret != 0) {
 888                         DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
 889                         return -1;
 890                 }
 891         }
 892
 893         return 0;
 894 }
 895
 896
/*
  state of one in-progress vacuum fetch: a private copy of the records
  received from one source node for one database
 */
struct vacuum_info {
        /* linkage in the rec->vacuum_info list */
        struct vacuum_info *next, *prev;
        /* recovery daemon state that owns this entry */
        struct ctdb_recoverd *rec;
        /* taken from the reqid of the first record in the message */
        uint32_t srcnode;
        /* database the records belong to */
        struct ctdb_db_context *ctdb_db;
        /* private copy of the received marshall buffer */
        struct ctdb_marshall_buffer *recs;
        /* next record inside recs that has not been processed yet */
        struct ctdb_rec_data *r;
};

/* forward declaration; the definition follows below */
static void vacuum_fetch_next(struct vacuum_info *v);
 907
/*
  called when a vacuum fetch call has completed - the call state is not
  needed any more, so just free it
 */
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
        talloc_free(state);
}
 915
 916
 917 /*
 918   process the next element from the vacuum list
 919 */
 920 static void vacuum_fetch_next(struct vacuum_info *v)
 921 {
 922         struct ctdb_call call;
 923         struct ctdb_rec_data *r;
 924
 925         while (v->recs->count) {
 926                 struct ctdb_client_call_state *state;
 927                 TDB_DATA data;
 928                 struct ctdb_ltdb_header *hdr;
 929
 930                 ZERO_STRUCT(call);
 931                 call.call_id = CTDB_NULL_FUNC;
 932                 call.flags = CTDB_IMMEDIATE_MIGRATION;
 933                 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
 934
 935                 r = v->r;
 936                 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
 937                 v->recs->count--;
 938
 939                 call.key.dptr = &r->data[0];
 940                 call.key.dsize = r->keylen;
 941
 942                 /* ensure we don't block this daemon - just skip a record if we can't get
 943                    the chainlock */
 944                 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
 945                         continue;
 946                 }
 947
 948                 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
 949                 if (data.dptr == NULL) {
 950                         tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
 951                         continue;
 952                 }
 953
 954                 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
 955                         free(data.dptr);
 956                         tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
 957                         continue;
 958                 }
 959
 960                 hdr = (struct ctdb_ltdb_header *)data.dptr;
 961                 if (hdr->dmaster == v->rec->ctdb->pnn) {
 962                         /* its already local */
 963                         free(data.dptr);
 964                         tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
 965                         continue;
 966                 }
 967
 968                 free(data.dptr);
 969
 970                 state = ctdb_call_send(v->ctdb_db, &call);
 971                 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
 972                 if (state == NULL) {
 973                         DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
 974                         talloc_free(v);
 975                         return;
 976                 }
 977                 state->async.fn = vacuum_fetch_callback;
 978                 state->async.private_data = NULL;
 979         }
 980
 981         talloc_free(v);
 982 }
 983
 984
/*
  destroy a vacuum info structure

  Unlinks v from the rec->vacuum_info list; always returns 0.
 */
static int vacuum_info_destructor(struct vacuum_info *v)
{
        DLIST_REMOVE(v->rec->vacuum_info, v);
        return 0;
}
 993
 994
 995 /*
 996   handler for vacuum fetch
 997 */
 998 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
 999                                  TDB_DATA data, void *private_data)
1000 {
1001         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1002         struct ctdb_marshall_buffer *recs;
1003         int ret, i;
1004         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1005         const char *name;
1006         struct ctdb_dbid_map *dbmap=NULL;
1007         bool persistent = false;
1008         struct ctdb_db_context *ctdb_db;
1009         struct ctdb_rec_data *r;
1010         uint32_t srcnode;
1011         struct vacuum_info *v;
1012
1013         recs = (struct ctdb_marshall_buffer *)data.dptr;
1014         r = (struct ctdb_rec_data *)&recs->data[0];
1015
1016         if (recs->count == 0) {
1017                 talloc_free(tmp_ctx);
1018                 return;
1019         }
1020
1021         srcnode = r->reqid;
1022
1023         for (v=rec->vacuum_info;v;v=v->next) {
1024                 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
1025                         /* we're already working on records from this node */
1026                         talloc_free(tmp_ctx);
1027                         return;
1028                 }
1029         }
1030
1031         /* work out if the database is persistent */
1032         ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1033         if (ret != 0) {
1034                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1035                 talloc_free(tmp_ctx);
1036                 return;
1037         }
1038
1039         for (i=0;i<dbmap->num;i++) {
1040                 if (dbmap->dbs[i].dbid == recs->db_id) {
1041                         persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1042                         break;
1043                 }
1044         }
1045         if (i == dbmap->num) {
1046                 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1047                 talloc_free(tmp_ctx);
1048                 return;
1049         }
1050
1051         /* find the name of this database */
1052         if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1053                 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1054                 talloc_free(tmp_ctx);
1055                 return;
1056         }
1057
1058         /* attach to it */
1059         ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1060         if (ctdb_db == NULL) {
1061                 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
1062                 talloc_free(tmp_ctx);
1063                 return;
1064         }
1065
1066         v = talloc_zero(rec, struct vacuum_info);
1067         if (v == NULL) {
1068                 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1069                 talloc_free(tmp_ctx);
1070                 return;
1071         }
1072
1073         v->rec = rec;
1074         v->srcnode = srcnode;
1075         v->ctdb_db = ctdb_db;
1076         v->recs = talloc_memdup(v, recs, data.dsize);
1077         if (v->recs == NULL) {
1078                 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1079                 talloc_free(v);
1080                 talloc_free(tmp_ctx);
1081                 return;
1082         }
1083         v->r =  (struct ctdb_rec_data *)&v->recs->data[0];
1084
1085         DLIST_ADD(rec->vacuum_info, v);
1086
1087         talloc_set_destructor(v, vacuum_info_destructor);
1088
1089         vacuum_fetch_next(v);
1090         talloc_free(tmp_ctx);
1091 }
1092
1093
1094 /*
1095  * handler for database detach
1096  */
1097 static void detach_database_handler(struct ctdb_context *ctdb, uint64_t srvid,
1098                                     TDB_DATA data, void *private_data)
1099 {
1100         struct ctdb_recoverd *rec = talloc_get_type(private_data,
1101                                                     struct ctdb_recoverd);
1102         uint32_t db_id;
1103         struct vacuum_info *v, *vnext;
1104         struct ctdb_db_context *ctdb_db;
1105
1106         if (data.dsize != sizeof(db_id)) {
1107                 return;
1108         }
1109         db_id = *(uint32_t *)data.dptr;
1110
1111         ctdb_db = find_ctdb_db(ctdb, db_id);
1112         if (ctdb_db == NULL) {
1113                 /* database is not attached */
1114                 return;
1115         }
1116
1117         /* Stop any active vacuum fetch */
1118         v = rec->vacuum_info;
1119         while (v != NULL) {
1120                 vnext = v->next;
1121
1122                 if (v->ctdb_db->db_id == db_id) {
1123                         talloc_free(v);
1124                 }
1125                 v = vnext;
1126         }
1127
1128         DLIST_REMOVE(ctdb->db_list, ctdb_db);
1129
1130         DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
1131                              ctdb_db->db_name));
1132         talloc_free(ctdb_db);
1133 }
1134
/*
  timer callback used by ctdb_wait_timeout: p points at a uint32_t flag
  which is set to 1 to signal that the wait is over
 */
static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
                              struct timeval yt, void *p)
{
        *(uint32_t *)p = 1;
}
1144
1145 /*
1146   wait for a given number of seconds
1147  */
1148 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1149 {
1150         uint32_t timed_out = 0;
1151         time_t usecs = (secs - (time_t)secs) * 1000000;
1152         event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1153         while (!timed_out) {
1154                 event_loop_once(ctdb->ev);
1155         }
1156 }
1157
1158 /*
1159   called when an election times out (ends)
1160  */
1161 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1162                                   struct timeval t, void *p)
1163 {
1164         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1165         rec->election_timeout = NULL;
1166         fast_start = false;
1167
1168         DEBUG(DEBUG_WARNING,("Election period ended\n"));
1169 }
1170
1171
1172 /*
1173   wait for an election to finish. It finished election_timeout seconds after
1174   the last election packet is received
1175  */
1176 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1177 {
1178         struct ctdb_context *ctdb = rec->ctdb;
1179         while (rec->election_timeout) {
1180                 event_loop_once(ctdb->ev);
1181         }
1182 }
1183
1184 /*
1185   Update our local flags from all remote connected nodes.
1186   This is only run when we are or we belive we are the recovery master
1187  */
1188 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1189 {
1190         int j;
1191         struct ctdb_context *ctdb = rec->ctdb;
1192         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1193
1194         /* get the nodemap for all active remote nodes and verify
1195            they are the same as for this node
1196          */
1197         for (j=0; j<nodemap->num; j++) {
1198                 struct ctdb_node_map *remote_nodemap=NULL;
1199                 int ret;
1200
1201                 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1202                         continue;
1203                 }
1204                 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1205                         continue;
1206                 }
1207
1208                 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1209                                            mem_ctx, &remote_nodemap);
1210                 if (ret != 0) {
1211                         DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1212                                   nodemap->nodes[j].pnn));
1213                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1214                         talloc_free(mem_ctx);
1215                         return MONITOR_FAILED;
1216                 }
1217                 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1218                         /* We should tell our daemon about this so it
1219                            updates its flags or else we will log the same
1220                            message again in the next iteration of recovery.
1221                            Since we are the recovery master we can just as
1222                            well update the flags on all nodes.
1223                         */
1224                         ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1225                         if (ret != 0) {
1226                                 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1227                                 return -1;
1228                         }
1229
1230                         /* Update our local copy of the flags in the recovery
1231                            daemon.
1232                         */
1233                         DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1234                                  nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1235                                  nodemap->nodes[j].flags));
1236                         nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1237                 }
1238                 talloc_free(remote_nodemap);
1239         }
1240         talloc_free(mem_ctx);
1241         return MONITOR_OK;
1242 }
1243
1244
1245 /* Create a new random generation ip.
1246    The generation id can not be the INVALID_GENERATION id
1247 */
1248 static uint32_t new_generation(void)
1249 {
1250         uint32_t generation;
1251
1252         while (1) {
1253                 generation = random();
1254
1255                 if (generation != INVALID_GENERATION) {
1256                         break;
1257                 }
1258         }
1259
1260         return generation;
1261 }
1262
1263
1264 /*
1265   create a temporary working database
1266  */
1267 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1268 {
1269         char *name;
1270         struct tdb_wrap *recdb;
1271         unsigned tdb_flags;
1272
1273         /* open up the temporary recovery database */
1274         name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1275                                ctdb->db_directory_state,
1276                                ctdb->pnn);
1277         if (name == NULL) {
1278                 return NULL;
1279         }
1280         unlink(name);
1281
1282         tdb_flags = TDB_NOLOCK;
1283         if (ctdb->valgrinding) {
1284                 tdb_flags |= TDB_NOMMAP;
1285         }
1286         tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1287
1288         recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1289                               tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1290         if (recdb == NULL) {
1291                 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1292         }
1293
1294         talloc_free(name);
1295
1296         return recdb;
1297 }
1298
1299
/*
   state handed to traverse_recdb, the traverse function for pulling all
   relevant records from recdb
 */
struct recdb_data {
        struct ctdb_context *ctdb;
        /* marshall buffer the records are appended to; may be reallocated */
        struct ctdb_marshall_buffer *recdata;
        /* number of bytes of recdata in use */
        uint32_t len;
        /* number of bytes allocated for recdata */
        uint32_t allocated_len;
        /* set by the traverse function when a record could not be added */
        bool failed;
        /* true for a persistent database: empty records are kept and the
           record headers are left untouched */
        bool persistent;
};
1311
1312 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1313 {
1314         struct recdb_data *params = (struct recdb_data *)p;
1315         struct ctdb_rec_data *rec;
1316         struct ctdb_ltdb_header *hdr;
1317
1318         /*
1319          * skip empty records - but NOT for persistent databases:
1320          *
1321          * The record-by-record mode of recovery deletes empty records.
1322          * For persistent databases, this can lead to data corruption
1323          * by deleting records that should be there:
1324          *
1325          * - Assume the cluster has been running for a while.
1326          *
1327          * - A record R in a persistent database has been created and
1328          *   deleted a couple of times, the last operation being deletion,
1329          *   leaving an empty record with a high RSN, say 10.
1330          *
1331          * - Now a node N is turned off.
1332          *
1333          * - This leaves the local database copy of D on N with the empty
1334          *   copy of R and RSN 10. On all other nodes, the recovery has deleted
1335          *   the copy of record R.
1336          *
1337          * - Now the record is created again while node N is turned off.
1338          *   This creates R with RSN = 1 on all nodes except for N.
1339          *
1340          * - Now node N is turned on again. The following recovery will chose
1341          *   the older empty copy of R due to RSN 10 > RSN 1.
1342          *
1343          * ==> Hence the record is gone after the recovery.
1344          *
1345          * On databases like Samba's registry, this can damage the higher-level
1346          * data structures built from the various tdb-level records.
1347          */
1348         if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1349                 return 0;
1350         }
1351
1352         /* update the dmaster field to point to us */
1353         hdr = (struct ctdb_ltdb_header *)data.dptr;
1354         if (!params->persistent) {
1355                 hdr->dmaster = params->ctdb->pnn;
1356                 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1357         }
1358
1359         /* add the record to the blob ready to send to the nodes */
1360         rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1361         if (rec == NULL) {
1362                 params->failed = true;
1363                 return -1;
1364         }
1365         if (params->len + rec->length >= params->allocated_len) {
1366                 params->allocated_len = rec->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1367                 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1368         }
1369         if (params->recdata == NULL) {
1370                 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1371                          rec->length + params->len));
1372                 params->failed = true;
1373                 return -1;
1374         }
1375         params->recdata->count++;
1376         memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1377         params->len += rec->length;
1378         talloc_free(rec);
1379
1380         return 0;
1381 }
1382
1383 /*
1384   push the recdb database out to all nodes
1385  */
1386 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1387                                bool persistent,
1388                                struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1389 {
1390         struct recdb_data params;
1391         struct ctdb_marshall_buffer *recdata;
1392         TDB_DATA outdata;
1393         TALLOC_CTX *tmp_ctx;
1394         uint32_t *nodes;
1395
1396         tmp_ctx = talloc_new(ctdb);
1397         CTDB_NO_MEMORY(ctdb, tmp_ctx);
1398
1399         recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1400         CTDB_NO_MEMORY(ctdb, recdata);
1401
1402         recdata->db_id = dbid;
1403
1404         params.ctdb = ctdb;
1405         params.recdata = recdata;
1406         params.len = offsetof(struct ctdb_marshall_buffer, data);
1407         params.allocated_len = params.len;
1408         params.failed = false;
1409         params.persistent = persistent;
1410
1411         if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1412                 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1413                 talloc_free(params.recdata);
1414                 talloc_free(tmp_ctx);
1415                 return -1;
1416         }
1417
1418         if (params.failed) {
1419                 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1420                 talloc_free(params.recdata);
1421                 talloc_free(tmp_ctx);
1422                 return -1;
1423         }
1424
1425         recdata = params.recdata;
1426
1427         outdata.dptr = (void *)recdata;
1428         outdata.dsize = params.len;
1429
1430         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1431         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1432                                         nodes, 0,
1433                                         CONTROL_TIMEOUT(), false, outdata,
1434                                         NULL, NULL,
1435                                         NULL) != 0) {
1436                 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1437                 talloc_free(recdata);
1438                 talloc_free(tmp_ctx);
1439                 return -1;
1440         }
1441
1442         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1443                   dbid, recdata->count));
1444
1445         talloc_free(recdata);
1446         talloc_free(tmp_ctx);
1447
1448         return 0;
1449 }
1450
1451
1452 /*
1453   go through a full recovery on one database
1454  */
1455 static int recover_database(struct ctdb_recoverd *rec,
1456                             TALLOC_CTX *mem_ctx,
1457                             uint32_t dbid,
1458                             bool persistent,
1459                             uint32_t pnn,
1460                             struct ctdb_node_map *nodemap,
1461                             uint32_t transaction_id)
1462 {
1463         struct tdb_wrap *recdb;
1464         int ret;
1465         struct ctdb_context *ctdb = rec->ctdb;
1466         TDB_DATA data;
1467         struct ctdb_control_wipe_database w;
1468         uint32_t *nodes;
1469
1470         recdb = create_recdb(ctdb, mem_ctx);
1471         if (recdb == NULL) {
1472                 return -1;
1473         }
1474
1475         /* pull all remote databases onto the recdb */
1476         ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1477         if (ret != 0) {
1478                 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1479                 return -1;
1480         }
1481
1482         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1483
1484         /* wipe all the remote databases. This is safe as we are in a transaction */
1485         w.db_id = dbid;
1486         w.transaction_id = transaction_id;
1487
1488         data.dptr = (void *)&w;
1489         data.dsize = sizeof(w);
1490
1491         nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1492         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1493                                         nodes, 0,
1494                                         CONTROL_TIMEOUT(), false, data,
1495                                         NULL, NULL,
1496                                         NULL) != 0) {
1497                 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1498                 talloc_free(recdb);
1499                 return -1;
1500         }
1501
1502         /* push out the correct database. This sets the dmaster and skips
1503            the empty records */
1504         ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1505         if (ret != 0) {
1506                 talloc_free(recdb);
1507                 return -1;
1508         }
1509
1510         /* all done with this database */
1511         talloc_free(recdb);
1512
1513         return 0;
1514 }
1515
1516 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1517                                          struct ctdb_recoverd *rec,
1518                                          struct ctdb_node_map *nodemap,
1519                                          uint32_t *culprit)
1520 {
1521         int j;
1522         int ret;
1523
1524         if (ctdb->num_nodes != nodemap->num) {
1525                 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1526                                   ctdb->num_nodes, nodemap->num));
1527                 if (culprit) {
1528                         *culprit = ctdb->pnn;
1529                 }
1530                 return -1;
1531         }
1532
1533         for (j=0; j<nodemap->num; j++) {
1534                 /* For readability */
1535                 struct ctdb_node *node = ctdb->nodes[j];
1536
1537                 /* release any existing data */
1538                 if (node->known_public_ips) {
1539                         talloc_free(node->known_public_ips);
1540                         node->known_public_ips = NULL;
1541                 }
1542                 if (node->available_public_ips) {
1543                         talloc_free(node->available_public_ips);
1544                         node->available_public_ips = NULL;
1545                 }
1546
1547                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1548                         continue;
1549                 }
1550
1551                 /* Retrieve the list of known public IPs from the node */
1552                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1553                                         CONTROL_TIMEOUT(),
1554                                         node->pnn,
1555                                         ctdb->nodes,
1556                                         0,
1557                                         &node->known_public_ips);
1558                 if (ret != 0) {
1559                         DEBUG(DEBUG_ERR,
1560                               ("Failed to read known public IPs from node: %u\n",
1561                                node->pnn));
1562                         if (culprit) {
1563                                 *culprit = node->pnn;
1564                         }
1565                         return -1;
1566                 }
1567
1568                 if (ctdb->do_checkpublicip &&
1569                     rec->takeover_runs_disable_ctx == NULL &&
1570                     verify_remote_ip_allocation(ctdb,
1571                                                  node->known_public_ips,
1572                                                  node->pnn)) {
1573                         DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
1574                         rec->need_takeover_run = true;
1575                 }
1576
1577                 /* Retrieve the list of available public IPs from the node */
1578                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1579                                         CONTROL_TIMEOUT(),
1580                                         node->pnn,
1581                                         ctdb->nodes,
1582                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1583                                         &node->available_public_ips);
1584                 if (ret != 0) {
1585                         DEBUG(DEBUG_ERR,
1586                               ("Failed to read available public IPs from node: %u\n",
1587                                node->pnn));
1588                         if (culprit) {
1589                                 *culprit = node->pnn;
1590                         }
1591                         return -1;
1592                 }
1593         }
1594
1595         return 0;
1596 }
1597
1598 /* when we start a recovery, make sure all nodes use the same reclock file
1599    setting
1600 */
1601 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1602 {
1603         struct ctdb_context *ctdb = rec->ctdb;
1604         TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1605         TDB_DATA data;
1606         uint32_t *nodes;
1607
1608         if (ctdb->recovery_lock_file == NULL) {
1609                 data.dptr  = NULL;
1610                 data.dsize = 0;
1611         } else {
1612                 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1613                 data.dptr  = (uint8_t *)ctdb->recovery_lock_file;
1614         }
1615
1616         nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1617         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1618                                         nodes, 0,
1619                                         CONTROL_TIMEOUT(),
1620                                         false, data,
1621                                         NULL, NULL,
1622                                         rec) != 0) {
1623                 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1624                 talloc_free(tmp_ctx);
1625                 return -1;
1626         }
1627
1628         talloc_free(tmp_ctx);
1629         return 0;
1630 }
1631
1632
1633 /*
1634  * this callback is called for every node that failed to execute ctdb_takeover_run()
1635  * and set flag to re-run takeover run.
1636  */
1637 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1638 {
1639         DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1640
1641         if (callback_data != NULL) {
1642                 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1643
1644                 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1645
1646                 ctdb_set_culprit(rec, node_pnn);
1647         }
1648 }
1649
1650
1651 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1652 {
1653         struct ctdb_context *ctdb = rec->ctdb;
1654         int i;
1655         struct ctdb_banning_state *ban_state;
1656
1657         *self_ban = false;
1658         for (i=0; i<ctdb->num_nodes; i++) {
1659                 if (ctdb->nodes[i]->ban_state == NULL) {
1660                         continue;
1661                 }
1662                 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1663                 if (ban_state->count < 2*ctdb->num_nodes) {
1664                         continue;
1665                 }
1666
1667                 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1668                         ctdb->nodes[i]->pnn, ban_state->count,
1669                         ctdb->tunable.recovery_ban_period));
1670                 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1671                 ban_state->count = 0;
1672
1673                 /* Banning ourself? */
1674                 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
1675                         *self_ban = true;
1676                 }
1677         }
1678 }
1679
1680 static bool do_takeover_run(struct ctdb_recoverd *rec,
1681                             struct ctdb_node_map *nodemap,
1682                             bool banning_credits_on_fail)
1683 {
1684         uint32_t *nodes = NULL;
1685         struct srvid_request_data dtr;
1686         TDB_DATA data;
1687         int i;
1688         uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1689         int ret;
1690         bool ok;
1691
1692         DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1693
1694         if (rec->takeover_run_in_progress) {
1695                 DEBUG(DEBUG_ERR, (__location__
1696                                   " takeover run already in progress \n"));
1697                 ok = false;
1698                 goto done;
1699         }
1700
1701         rec->takeover_run_in_progress = true;
1702
1703         /* If takeover runs are in disabled then fail... */
1704         if (rec->takeover_runs_disable_ctx != NULL) {
1705                 DEBUG(DEBUG_ERR,
1706                       ("Takeover runs are disabled so refusing to run one\n"));
1707                 ok = false;
1708                 goto done;
1709         }
1710
1711         /* Disable IP checks (takeover runs, really) on other nodes
1712          * while doing this takeover run.  This will stop those other
1713          * nodes from triggering takeover runs when think they should
1714          * be hosting an IP but it isn't yet on an interface.  Don't
1715          * wait for replies since a failure here might cause some
1716          * noise in the logs but will not actually cause a problem.
1717          */
1718         dtr.srvid = 0; /* No reply */
1719         dtr.pnn = -1;
1720
1721         data.dptr  = (uint8_t*)&dtr;
1722         data.dsize = sizeof(dtr);
1723
1724         nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1725
1726         /* Disable for 60 seconds.  This can be a tunable later if
1727          * necessary.
1728          */
1729         dtr.data = 60;
1730         for (i = 0; i < talloc_array_length(nodes); i++) {
1731                 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1732                                              CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1733                                              data) != 0) {
1734                         DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1735                 }
1736         }
1737
1738         ret = ctdb_takeover_run(rec->ctdb, nodemap,
1739                                 rec->force_rebalance_nodes,
1740                                 takeover_fail_callback,
1741                                 banning_credits_on_fail ? rec : NULL);
1742
1743         /* Reenable takeover runs and IP checks on other nodes */
1744         dtr.data = 0;
1745         for (i = 0; i < talloc_array_length(nodes); i++) {
1746                 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1747                                              CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1748                                              data) != 0) {
1749                         DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
1750                 }
1751         }
1752
1753         if (ret != 0) {
1754                 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1755                 ok = false;
1756                 goto done;
1757         }
1758
1759         ok = true;
1760         /* Takeover run was successful so clear force rebalance targets */
1761         if (rebalance_nodes == rec->force_rebalance_nodes) {
1762                 TALLOC_FREE(rec->force_rebalance_nodes);
1763         } else {
1764                 DEBUG(DEBUG_WARNING,
1765                       ("Rebalance target nodes changed during takeover run - not clearing\n"));
1766         }
1767 done:
1768         rec->need_takeover_run = !ok;
1769         talloc_free(nodes);
1770         rec->takeover_run_in_progress = false;
1771
1772         DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1773         return ok;
1774 }
1775
1776
1777 /*
1778   we are the recmaster, and recovery is needed - start a recovery run
1779  */
1780 static int do_recovery(struct ctdb_recoverd *rec,
1781                        TALLOC_CTX *mem_ctx, uint32_t pnn,
1782                        struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1783 {
1784         struct ctdb_context *ctdb = rec->ctdb;
1785         int i, j, ret;
1786         uint32_t generation;
1787         struct ctdb_dbid_map *dbmap;
1788         TDB_DATA data;
1789         uint32_t *nodes;
1790         struct timeval start_time;
1791         uint32_t culprit = (uint32_t)-1;
1792         bool self_ban;
1793
1794         DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1795
1796         /* if recovery fails, force it again */
1797         rec->need_recovery = true;
1798
1799         if (rec->election_timeout) {
1800                 /* an election is in progress */
1801                 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1802                 return -1;
1803         }
1804
1805         ban_misbehaving_nodes(rec, &self_ban);
1806         if (self_ban) {
1807                 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1808                 return -1;
1809         }
1810
1811         if (ctdb->recovery_lock_file != NULL) {
1812                 if (ctdb_recovery_have_lock(ctdb)) {
1813                         DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
1814                 } else {
1815                         start_time = timeval_current();
1816                         DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
1817                                              ctdb->recovery_lock_file));
1818                         if (!ctdb_recovery_lock(ctdb)) {
1819                                 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
1820                                         /* If ctdb is trying first recovery, it's
1821                                          * possible that current node does not know
1822                                          * yet who the recmaster is.
1823                                          */
1824                                         DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
1825                                                           " - retrying recovery\n"));
1826                                         return -1;
1827                                 }
1828
1829                                 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1830                                                  "and ban ourself for %u seconds\n",
1831                                                  ctdb->tunable.recovery_ban_period));
1832                                 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1833                                 return -1;
1834                         }
1835                         ctdb_ctrl_report_recd_lock_latency(ctdb,
1836                                                            CONTROL_TIMEOUT(),
1837                                                            timeval_elapsed(&start_time));
1838                         DEBUG(DEBUG_NOTICE,
1839                               ("Recovery lock taken successfully by recovery daemon\n"));
1840                 }
1841         }
1842
1843         DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1844
1845         /* get a list of all databases */
1846         ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1847         if (ret != 0) {
1848                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1849                 return -1;
1850         }
1851
1852         /* we do the db creation before we set the recovery mode, so the freeze happens
1853            on all databases we will be dealing with. */
1854
1855         /* verify that we have all the databases any other node has */
1856         ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1857         if (ret != 0) {
1858                 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1859                 return -1;
1860         }
1861
1862         /* verify that all other nodes have all our databases */
1863         ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1864         if (ret != 0) {
1865                 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1866                 return -1;
1867         }
1868         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1869
1870         /* update the database priority for all remote databases */
1871         ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1872         if (ret != 0) {
1873                 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1874         }
1875         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1876
1877
1878         /* update all other nodes to use the same setting for reclock files
1879            as the local recovery master.
1880         */
1881         sync_recovery_lock_file_across_cluster(rec);
1882
1883         /* set recovery mode to active on all nodes */
1884         ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1885         if (ret != 0) {
1886                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1887                 return -1;
1888         }
1889
1890         /* execute the "startrecovery" event script on all nodes */
1891         ret = run_startrecovery_eventscript(rec, nodemap);
1892         if (ret!=0) {
1893                 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1894                 return -1;
1895         }
1896
1897         /*
1898           update all nodes to have the same flags that we have
1899          */
1900         for (i=0;i<nodemap->num;i++) {
1901                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1902                         continue;
1903                 }
1904
1905                 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1906                 if (ret != 0) {
1907                         if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1908                                 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1909                         } else {
1910                                 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1911                                 return -1;
1912                         }
1913                 }
1914         }
1915
1916         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1917
1918         /* pick a new generation number */
1919         generation = new_generation();
1920
1921         /* change the vnnmap on this node to use the new generation
1922            number but not on any other nodes.
1923            this guarantees that if we abort the recovery prematurely
1924            for some reason (a node stops responding?)
1925            that we can just return immediately and we will reenter
1926            recovery shortly again.
1927            I.e. we deliberately leave the cluster with an inconsistent
1928            generation id to allow us to abort recovery at any stage and
1929            just restart it from scratch.
1930          */
1931         vnnmap->generation = generation;
1932         ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1933         if (ret != 0) {
1934                 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1935                 return -1;
1936         }
1937
1938         data.dptr = (void *)&generation;
1939         data.dsize = sizeof(uint32_t);
1940
1941         nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1942         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1943                                         nodes, 0,
1944                                         CONTROL_TIMEOUT(), false, data,
1945                                         NULL,
1946                                         transaction_start_fail_callback,
1947                                         rec) != 0) {
1948                 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1949                 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1950                                         nodes, 0,
1951                                         CONTROL_TIMEOUT(), false, tdb_null,
1952                                         NULL,
1953                                         NULL,
1954                                         NULL) != 0) {
1955                         DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1956                 }
1957                 return -1;
1958         }
1959
1960         DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1961
1962         for (i=0;i<dbmap->num;i++) {
1963                 ret = recover_database(rec, mem_ctx,
1964                                        dbmap->dbs[i].dbid,
1965                                        dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1966                                        pnn, nodemap, generation);
1967                 if (ret != 0) {
1968                         DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1969                         return -1;
1970                 }
1971         }
1972
1973         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1974
1975         /* commit all the changes */
1976         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1977                                         nodes, 0,
1978                                         CONTROL_TIMEOUT(), false, data,
1979                                         NULL, NULL,
1980                                         NULL) != 0) {
1981                 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1982                 return -1;
1983         }
1984
1985         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1986
1987
1988         /* update the capabilities for all nodes */
1989         ret = update_capabilities(ctdb, nodemap);
1990         if (ret!=0) {
1991                 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1992                 return -1;
1993         }
1994
1995         /* build a new vnn map with all the currently active and
1996            unbanned nodes */
1997         generation = new_generation();
1998         vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1999         CTDB_NO_MEMORY(ctdb, vnnmap);
2000         vnnmap->generation = generation;
2001         vnnmap->size = 0;
2002         vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
2003         CTDB_NO_MEMORY(ctdb, vnnmap->map);
2004         for (i=j=0;i<nodemap->num;i++) {
2005                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2006                         continue;
2007                 }
2008                 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
2009                         /* this node can not be an lmaster */
2010                         DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
2011                         continue;
2012                 }
2013
2014                 vnnmap->size++;
2015                 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2016                 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2017                 vnnmap->map[j++] = nodemap->nodes[i].pnn;
2018
2019         }
2020         if (vnnmap->size == 0) {
2021                 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2022                 vnnmap->size++;
2023                 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2024                 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2025                 vnnmap->map[0] = pnn;
2026         }
2027
2028         /* update to the new vnnmap on all nodes */
2029         ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
2030         if (ret != 0) {
2031                 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
2032                 return -1;
2033         }
2034
2035         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
2036
2037         /* update recmaster to point to us for all nodes */
2038         ret = set_recovery_master(ctdb, nodemap, pnn);
2039         if (ret!=0) {
2040                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
2041                 return -1;
2042         }
2043
2044         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
2045
2046         /* disable recovery mode */
2047         ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
2048         if (ret != 0) {
2049                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
2050                 return -1;
2051         }
2052
2053         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
2054
2055         /* Fetch known/available public IPs from each active node */
2056         ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
2057         if (ret != 0) {
2058                 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2059                                  culprit));
2060                 rec->need_takeover_run = true;
2061                 return -1;
2062         }
2063
2064         do_takeover_run(rec, nodemap, false);
2065
2066         /* execute the "recovered" event script on all nodes */
2067         ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2068         if (ret!=0) {
2069                 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2070                 return -1;
2071         }
2072
2073         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2074
2075         /* send a message to all clients telling them that the cluster
2076            has been reconfigured */
2077         ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2078                                        CTDB_SRVID_RECONFIGURE, tdb_null);
2079         if (ret != 0) {
2080                 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2081                 return -1;
2082         }
2083
2084         DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2085
2086         rec->need_recovery = false;
2087
2088         /* we managed to complete a full recovery, make sure to forgive
2089            any past sins by the nodes that could now participate in the
2090            recovery.
2091         */
2092         DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2093         for (i=0;i<nodemap->num;i++) {
2094                 struct ctdb_banning_state *ban_state;
2095
2096                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2097                         continue;
2098                 }
2099
2100                 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2101                 if (ban_state == NULL) {
2102                         continue;
2103                 }
2104
2105                 ban_state->count = 0;
2106         }
2107
2108
2109         /* We just finished a recovery successfully.
2110            We now wait for rerecovery_timeout before we allow
2111            another recovery to take place.
2112         */
2113         DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2114         ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
2115         DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
2116
2117         return 0;
2118 }
2119
2120
/*
 * Payload of a recovery master election message.
 *
 * Elections are decided by comparing, in this order, the number of
 * connected nodes, the priority time and finally the pnn.
 *
 * The structure is sent as the raw message body, so the order and the
 * types of the fields must not be changed.
 */
struct election_message {
        /* number of nodes the sender does not see as disconnected */
        uint32_t num_connected;

        /* priority time of the sender, used as the second criterion */
        struct timeval priority_time;

        /* pnn of the sender, used as the final tie breaker */
        uint32_t pnn;

        /* node flags of the sender (NODE_FLAGS_*) */
        uint32_t node_flags;
};
2131
2132 /*
2133   form this nodes election data
2134  */
2135 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2136 {
2137         int ret, i;
2138         struct ctdb_node_map *nodemap;
2139         struct ctdb_context *ctdb = rec->ctdb;
2140
2141         ZERO_STRUCTP(em);
2142
2143         em->pnn = rec->ctdb->pnn;
2144         em->priority_time = rec->priority_time;
2145
2146         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2147         if (ret != 0) {
2148                 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2149                 return;
2150         }
2151
2152         rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2153         em->node_flags = rec->node_flags;
2154
2155         for (i=0;i<nodemap->num;i++) {
2156                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2157                         em->num_connected++;
2158                 }
2159         }
2160
2161         /* we shouldnt try to win this election if we cant be a recmaster */
2162         if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2163                 em->num_connected = 0;
2164                 em->priority_time = timeval_current();
2165         }
2166
2167         talloc_free(nodemap);
2168 }
2169
2170 /*
2171   see if the given election data wins
2172  */
2173 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2174 {
2175         struct election_message myem;
2176         int cmp = 0;
2177
2178         ctdb_election_data(rec, &myem);
2179
2180         /* we cant win if we dont have the recmaster capability */
2181         if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2182                 return false;
2183         }
2184
2185         /* we cant win if we are banned */
2186         if (rec->node_flags & NODE_FLAGS_BANNED) {
2187                 return false;
2188         }
2189
2190         /* we cant win if we are stopped */
2191         if (rec->node_flags & NODE_FLAGS_STOPPED) {
2192                 return false;
2193         }
2194
2195         /* we will automatically win if the other node is banned */
2196         if (em->node_flags & NODE_FLAGS_BANNED) {
2197                 return true;
2198         }
2199
2200         /* we will automatically win if the other node is banned */
2201         if (em->node_flags & NODE_FLAGS_STOPPED) {
2202                 return true;
2203         }
2204
2205         /* try to use the most connected node */
2206         if (cmp == 0) {
2207                 cmp = (int)myem.num_connected - (int)em->num_connected;
2208         }
2209
2210         /* then the longest running node */
2211         if (cmp == 0) {
2212                 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2213         }
2214
2215         if (cmp == 0) {
2216                 cmp = (int)myem.pnn - (int)em->pnn;
2217         }
2218
2219         return cmp > 0;
2220 }
2221
2222 /*
2223   send out an election request
2224  */
2225 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2226 {
2227         int ret;
2228         TDB_DATA election_data;
2229         struct election_message emsg;
2230         uint64_t srvid;
2231         struct ctdb_context *ctdb = rec->ctdb;
2232
2233         srvid = CTDB_SRVID_RECOVERY;
2234
2235         ctdb_election_data(rec, &emsg);
2236
2237         election_data.dsize = sizeof(struct election_message);
2238         election_data.dptr  = (unsigned char *)&emsg;
2239
2240
2241         /* first we assume we will win the election and set
2242            recoverymaster to be ourself on the current node
2243          */
2244         ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2245         if (ret != 0) {
2246                 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
2247                 return -1;
2248         }
2249
2250
2251         /* send an election message to all active nodes */
2252         DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2253         return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2254 }
2255
2256 /*
2257   this function will unban all nodes in the cluster
2258 */
2259 static void unban_all_nodes(struct ctdb_context *ctdb)
2260 {
2261         int ret, i;
2262         struct ctdb_node_map *nodemap;
2263         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2264
2265         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2266         if (ret != 0) {
2267                 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2268                 return;
2269         }
2270
2271         for (i=0;i<nodemap->num;i++) {
2272                 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2273                   && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2274                         ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(),
2275                                                  nodemap->nodes[i].pnn, 0,
2276                                                  NODE_FLAGS_BANNED);
2277                         if (ret != 0) {
2278                                 DEBUG(DEBUG_ERR, (__location__ " failed to reset ban state\n"));
2279                         }
2280                 }
2281         }
2282
2283         talloc_free(tmp_ctx);
2284 }
2285
2286
2287 /*
2288   we think we are winning the election - send a broadcast election request
2289  */
2290 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2291 {
2292         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2293         int ret;
2294
2295         ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2296         if (ret != 0) {
2297                 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2298         }
2299
2300         talloc_free(rec->send_election_te);
2301         rec->send_election_te = NULL;
2302 }
2303
2304 /*
2305   handler for memory dumps
2306 */
2307 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2308                              TDB_DATA data, void *private_data)
2309 {
2310         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2311         TDB_DATA *dump;
2312         int ret;
2313         struct srvid_request *rd;
2314
2315         if (data.dsize != sizeof(struct srvid_request)) {
2316                 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2317                 talloc_free(tmp_ctx);
2318                 return;
2319         }
2320         rd = (struct srvid_request *)data.dptr;
2321
2322         dump = talloc_zero(tmp_ctx, TDB_DATA);
2323         if (dump == NULL) {
2324                 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2325                 talloc_free(tmp_ctx);
2326                 return;
2327         }
2328         ret = ctdb_dump_memory(ctdb, dump);
2329         if (ret != 0) {
2330                 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2331                 talloc_free(tmp_ctx);
2332                 return;
2333         }
2334
2335 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2336
2337         ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2338         if (ret != 0) {
2339                 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2340                 talloc_free(tmp_ctx);
2341                 return;
2342         }
2343
2344         talloc_free(tmp_ctx);
2345 }
2346
2347 /*
2348   handler for reload_nodes
2349 */
2350 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2351                              TDB_DATA data, void *private_data)
2352 {
2353         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2354
2355         DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2356
2357         ctdb_load_nodes_file(rec->ctdb);
2358 }
2359
2360
2361 static void ctdb_rebalance_timeout(struct event_context *ev,
2362                                    struct timed_event *te,
2363                                    struct timeval t, void *p)
2364 {
2365         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2366
2367         if (rec->force_rebalance_nodes == NULL) {
2368                 DEBUG(DEBUG_ERR,
2369                       ("Rebalance timeout occurred - no nodes to rebalance\n"));
2370                 return;
2371         }
2372
2373         DEBUG(DEBUG_NOTICE,
2374               ("Rebalance timeout occurred - do takeover run\n"));
2375         do_takeover_run(rec, rec->nodemap, false);
2376 }
2377
2378
2379 static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
2380                                         uint64_t srvid,
2381                                         TDB_DATA data, void *private_data)
2382 {
2383         uint32_t pnn;
2384         uint32_t *t;
2385         int len;
2386         uint32_t deferred_rebalance;
2387         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2388
2389         if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2390                 return;
2391         }
2392
2393         if (data.dsize != sizeof(uint32_t)) {
2394                 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2395                 return;
2396         }
2397
2398         pnn = *(uint32_t *)&data.dptr[0];
2399
2400         DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2401
2402         /* Copy any existing list of nodes.  There's probably some
2403          * sort of realloc variant that will do this but we need to
2404          * make sure that freeing the old array also cancels the timer
2405          * event for the timeout... not sure if realloc will do that.
2406          */
2407         len = (rec->force_rebalance_nodes != NULL) ?
2408                 talloc_array_length(rec->force_rebalance_nodes) :
2409                 0;
2410
2411         /* This allows duplicates to be added but they don't cause
2412          * harm.  A call to add a duplicate PNN arguably means that
2413          * the timeout should be reset, so this is the simplest
2414          * solution.
2415          */
2416         t = talloc_zero_array(rec, uint32_t, len+1);
2417         CTDB_NO_MEMORY_VOID(ctdb, t);
2418         if (len > 0) {
2419                 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2420         }
2421         t[len] = pnn;
2422
2423         talloc_free(rec->force_rebalance_nodes);
2424
2425         rec->force_rebalance_nodes = t;
2426
2427         /* If configured, setup a deferred takeover run to make sure
2428          * that certain nodes get IPs rebalanced to them.  This will
2429          * be cancelled if a successful takeover run happens before
2430          * the timeout.  Assign tunable value to variable for
2431          * readability.
2432          */
2433         deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2434         if (deferred_rebalance != 0) {
2435                 event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
2436                                 timeval_current_ofs(deferred_rebalance, 0),
2437                                 ctdb_rebalance_timeout, rec);
2438         }
2439 }
2440
2441
2442
2443 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2444                              TDB_DATA data, void *private_data)
2445 {
2446         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2447         struct ctdb_public_ip *ip;
2448
2449         if (rec->recmaster != rec->ctdb->pnn) {
2450                 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2451                 return;
2452         }
2453
2454         if (data.dsize != sizeof(struct ctdb_public_ip)) {
2455                 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2456                 return;
2457         }
2458
2459         ip = (struct ctdb_public_ip *)data.dptr;
2460
2461         update_ip_assignment_tree(rec->ctdb, ip);
2462 }
2463
2464
2465 static void clear_takeover_runs_disable(struct ctdb_recoverd *rec)
2466 {
2467         TALLOC_FREE(rec->takeover_runs_disable_ctx);
2468 }
2469
2470 static void reenable_takeover_runs(struct event_context *ev,
2471                                    struct timed_event *te,
2472                                    struct timeval yt, void *p)
2473 {
2474         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2475
2476         DEBUG(DEBUG_NOTICE,("Reenabling takeover runs after timeout\n"));
2477         clear_takeover_runs_disable(rec);
2478 }
2479
2480 static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
2481                                           uint64_t srvid, TDB_DATA data,
2482                                           void *private_data)
2483 {
2484         struct ctdb_recoverd *rec = talloc_get_type(private_data,
2485                                                     struct ctdb_recoverd);
2486         struct srvid_request_data *r;
2487         uint32_t timeout;
2488         TDB_DATA result;
2489         int32_t ret = 0;
2490
2491         /* Validate input data */
2492         if (data.dsize != sizeof(struct srvid_request_data)) {
2493                 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2494                                  "expecting %lu\n", (long unsigned)data.dsize,
2495                                  (long unsigned)sizeof(struct srvid_request)));
2496                 return;
2497         }
2498         if (data.dptr == NULL) {
2499                 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2500                 return;
2501         }
2502
2503         r = (struct srvid_request_data *)data.dptr;
2504         timeout = r->data;
2505
2506         if (timeout == 0) {
2507                 DEBUG(DEBUG_NOTICE,("Reenabling takeover runs\n"));
2508                 clear_takeover_runs_disable(rec);
2509                 ret = ctdb_get_pnn(ctdb);
2510                 goto done;
2511         }
2512
2513         if (rec->takeover_run_in_progress) {
2514                 DEBUG(DEBUG_ERR,
2515                       ("Unable to disable takeover runs - in progress\n"));
2516                 ret = -EAGAIN;
2517                 goto done;
2518         }
2519
2520         DEBUG(DEBUG_NOTICE,("Disabling takeover runs for %u seconds\n", timeout));
2521
2522         /* Clear any old timers */
2523         clear_takeover_runs_disable(rec);
2524
2525         /* When this is non-NULL it indicates that takeover runs are
2526          * disabled.  This context also holds the timeout timer.
2527          */
2528         rec->takeover_runs_disable_ctx = talloc_new(rec);
2529         if (rec->takeover_runs_disable_ctx == NULL) {
2530                 DEBUG(DEBUG_ERR,(__location__ " Unable to allocate memory\n"));
2531                 ret = -ENOMEM;
2532                 goto done;
2533         }
2534
2535         /* Arrange for the timeout to occur */
2536         event_add_timed(ctdb->ev, rec->takeover_runs_disable_ctx,
2537                         timeval_current_ofs(timeout, 0),
2538                         reenable_takeover_runs,
2539                         rec);
2540
2541         /* Returning our PNN tells the caller that we succeeded */
2542         ret = ctdb_get_pnn(ctdb);
2543 done:
2544         result.dsize = sizeof(int32_t);
2545         result.dptr  = (uint8_t *)&ret;
2546         srvid_request_reply(ctdb, (struct srvid_request *)r, result);
2547 }
2548
2549 /* Backward compatibility for this SRVID - call
2550  * disable_takeover_runs_handler() instead
2551  */
2552 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2553                                      TDB_DATA data, void *private_data)
2554 {
2555         struct ctdb_recoverd *rec = talloc_get_type(private_data,
2556                                                     struct ctdb_recoverd);
2557         TDB_DATA data2;
2558         struct srvid_request_data *req;
2559
2560         if (data.dsize != sizeof(uint32_t)) {
2561                 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2562                                  "expecting %lu\n", (long unsigned)data.dsize,
2563                                  (long unsigned)sizeof(uint32_t)));
2564                 return;
2565         }
2566         if (data.dptr == NULL) {
2567                 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2568                 return;
2569         }
2570
2571         req = talloc(ctdb, struct srvid_request_data);
2572         CTDB_NO_MEMORY_VOID(ctdb, req);
2573
2574         req->srvid = 0; /* No reply */
2575         req->pnn = -1;
2576         req->data = *((uint32_t *)data.dptr); /* Timeout */
2577
2578         data2.dsize = sizeof(*req);
2579         data2.dptr = (uint8_t *)req;
2580
2581         disable_takeover_runs_handler(rec->ctdb,
2582                                       CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
2583                                       data2, rec);
2584 }
2585
2586 /*
2587   handler for ip reallocate, just add it to the list of requests and
2588   handle this later in the monitor_cluster loop so we do not recurse
2589   with other requests to takeover_run()
2590 */
2591 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2592                                   TDB_DATA data, void *private_data)
2593 {
2594         struct srvid_request *request;
2595         struct ctdb_recoverd *rec = talloc_get_type(private_data,
2596                                                     struct ctdb_recoverd);
2597
2598         if (data.dsize != sizeof(struct srvid_request)) {
2599                 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2600                 return;
2601         }
2602
2603         request = (struct srvid_request *)data.dptr;
2604
2605         srvid_request_add(ctdb, &rec->reallocate_requests, request);
2606 }
2607
2608 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2609                                           struct ctdb_recoverd *rec)
2610 {
2611         TDB_DATA result;
2612         int32_t ret;
2613         uint32_t culprit;
2614         struct srvid_requests *current;
2615
2616         DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2617
2618         /* Only process requests that are currently pending.  More
2619          * might come in while the takeover run is in progress and
2620          * they will need to be processed later since they might
2621          * be in response flag changes.
2622          */
2623         current = rec->reallocate_requests;
2624         rec->reallocate_requests = NULL;
2625
2626         /* update the list of public ips that a node can handle for
2627            all connected nodes
2628         */
2629         ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2630         if (ret != 0) {
2631                 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2632                                  culprit));
2633                 rec->need_takeover_run = true;
2634         }
2635         if (ret == 0) {
2636                 if (do_takeover_run(rec, rec->nodemap, false)) {
2637                         ret = ctdb_get_pnn(ctdb);
2638                 } else {
2639                         ret = -1;
2640                 }
2641         }
2642
2643         result.dsize = sizeof(int32_t);
2644         result.dptr  = (uint8_t *)&ret;
2645
2646         srvid_requests_reply(ctdb, &current, result);
2647 }
2648
2649
2650 /*
2651   handler for recovery master elections
2652 */
2653 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2654                              TDB_DATA data, void *private_data)
2655 {
2656         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2657         int ret;
2658         struct election_message *em = (struct election_message *)data.dptr;
2659         TALLOC_CTX *mem_ctx;
2660
2661         /* Ignore election packets from ourself */
2662         if (ctdb->pnn == em->pnn) {
2663                 return;
2664         }
2665
2666         /* we got an election packet - update the timeout for the election */
2667         talloc_free(rec->election_timeout);
2668         rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2669                                                 fast_start ?
2670                                                 timeval_current_ofs(0, 500000) :
2671                                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2672                                                 ctdb_election_timeout, rec);
2673
2674         mem_ctx = talloc_new(ctdb);
2675
2676         /* someone called an election. check their election data
2677            and if we disagree and we would rather be the elected node,
2678            send a new election message to all other nodes
2679          */
2680         if (ctdb_election_win(rec, em)) {
2681                 if (!rec->send_election_te) {
2682                         rec->send_election_te = event_add_timed(ctdb->ev, rec,
2683                                                                 timeval_current_ofs(0, 500000),
2684                                                                 election_send_request, rec);
2685                 }
2686                 talloc_free(mem_ctx);
2687                 /*unban_all_nodes(ctdb);*/
2688                 return;
2689         }
2690
2691         /* we didn't win */
2692         talloc_free(rec->send_election_te);
2693         rec->send_election_te = NULL;
2694
2695         if (ctdb->recovery_lock_file != NULL) {
2696                 /* Release the recovery lock file */
2697                 if (em->pnn != ctdb->pnn &&
2698                     ctdb_recovery_have_lock(ctdb)) {
2699                         ctdb_recovery_unlock(ctdb);
2700                         unban_all_nodes(ctdb);
2701                 }
2702         }
2703
2704         /* ok, let that guy become recmaster then */
2705         ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2706         if (ret != 0) {
2707                 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2708                 talloc_free(mem_ctx);
2709                 return;
2710         }
2711
2712         talloc_free(mem_ctx);
2713         return;
2714 }
2715
2716
2717 /*
2718   force the start of the election process
2719  */
2720 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2721                            struct ctdb_node_map *nodemap)
2722 {
2723         int ret;
2724         struct ctdb_context *ctdb = rec->ctdb;
2725
2726         DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2727
2728         /* set all nodes to recovery mode to stop all internode traffic */
2729         ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2730         if (ret != 0) {
2731                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2732                 return;
2733         }
2734
2735         talloc_free(rec->election_timeout);
2736         rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2737                                                 fast_start ?
2738                                                 timeval_current_ofs(0, 500000) :
2739                                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2740                                                 ctdb_election_timeout, rec);
2741
2742         ret = send_election_request(rec, pnn);
2743         if (ret!=0) {
2744                 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2745                 return;
2746         }
2747
2748         /* wait for a few seconds to collect all responses */
2749         ctdb_wait_election(rec);
2750 }
2751
2752
2753
2754 /*
2755   handler for when a node changes its flags
2756 */
2757 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2758                             TDB_DATA data, void *private_data)
2759 {
2760         int ret;
2761         struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2762         struct ctdb_node_map *nodemap=NULL;
2763         TALLOC_CTX *tmp_ctx;
2764         int i;
2765         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2766         int disabled_flag_changed;
2767
2768         if (data.dsize != sizeof(*c)) {
2769                 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2770                 return;
2771         }
2772
2773         tmp_ctx = talloc_new(ctdb);
2774         CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2775
2776         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2777         if (ret != 0) {
2778                 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2779                 talloc_free(tmp_ctx);
2780                 return;
2781         }
2782
2783
2784         for (i=0;i<nodemap->num;i++) {
2785                 if (nodemap->nodes[i].pnn == c->pnn) break;
2786         }
2787
2788         if (i == nodemap->num) {
2789                 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2790                 talloc_free(tmp_ctx);
2791                 return;
2792         }
2793
2794         if (c->old_flags != c->new_flags) {
2795                 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2796         }
2797
2798         disabled_flag_changed =  (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2799
2800         nodemap->nodes[i].flags = c->new_flags;
2801
2802         ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2803                                      CTDB_CURRENT_NODE, &ctdb->recovery_master);
2804
2805         if (ret == 0) {
2806                 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2807                                            CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2808         }
2809
2810         if (ret == 0 &&
2811             ctdb->recovery_master == ctdb->pnn &&
2812             ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2813                 /* Only do the takeover run if the perm disabled or unhealthy
2814                    flags changed since these will cause an ip failover but not
2815                    a recovery.
2816                    If the node became disconnected or banned this will also
2817                    lead to an ip address failover but that is handled
2818                    during recovery
2819                 */
2820                 if (disabled_flag_changed) {
2821                         rec->need_takeover_run = true;
2822                 }
2823         }
2824
2825         talloc_free(tmp_ctx);
2826 }
2827
2828 /*
2829   handler for when we need to push out flag changes ot all other nodes
2830 */
2831 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2832                             TDB_DATA data, void *private_data)
2833 {
2834         int ret;
2835         struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2836         struct ctdb_node_map *nodemap=NULL;
2837         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2838         uint32_t recmaster;
2839         uint32_t *nodes;
2840
2841         /* find the recovery master */
2842         ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2843         if (ret != 0) {
2844                 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2845                 talloc_free(tmp_ctx);
2846                 return;
2847         }
2848
2849         /* read the node flags from the recmaster */
2850         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2851         if (ret != 0) {
2852                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2853                 talloc_free(tmp_ctx);
2854                 return;
2855         }
2856         if (c->pnn >= nodemap->num) {
2857                 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2858                 talloc_free(tmp_ctx);
2859                 return;
2860         }
2861
2862         /* send the flags update to all connected nodes */
2863         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2864
2865         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2866                                       nodes, 0, CONTROL_TIMEOUT(),
2867                                       false, data,
2868                                       NULL, NULL,
2869                                       NULL) != 0) {
2870                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2871
2872                 talloc_free(tmp_ctx);
2873                 return;
2874         }
2875
2876         talloc_free(tmp_ctx);
2877 }
2878
2879
2880 struct verify_recmode_normal_data {
2881         uint32_t count;
2882         enum monitor_result status;
2883 };
2884
2885 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2886 {
2887         struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2888
2889
2890         /* one more node has responded with recmode data*/
2891         rmdata->count--;
2892
2893         /* if we failed to get the recmode, then return an error and let
2894            the main loop try again.
2895         */
2896         if (state->state != CTDB_CONTROL_DONE) {
2897                 if (rmdata->status == MONITOR_OK) {
2898                         rmdata->status = MONITOR_FAILED;
2899                 }
2900                 return;
2901         }
2902
2903         /* if we got a response, then the recmode will be stored in the
2904            status field
2905         */
2906         if (state->status != CTDB_RECOVERY_NORMAL) {
2907                 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2908                 rmdata->status = MONITOR_RECOVERY_NEEDED;
2909         }
2910
2911         return;
2912 }
2913
2914
2915 /* verify that all nodes are in normal recovery mode */
2916 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2917 {
2918         struct verify_recmode_normal_data *rmdata;
2919         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2920         struct ctdb_client_control_state *state;
2921         enum monitor_result status;
2922         int j;
2923
2924         rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2925         CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2926         rmdata->count  = 0;
2927         rmdata->status = MONITOR_OK;
2928
2929         /* loop over all active nodes and send an async getrecmode call to
2930            them*/
2931         for (j=0; j<nodemap->num; j++) {
2932                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2933                         continue;
2934                 }
2935                 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2936                                         CONTROL_TIMEOUT(),
2937                                         nodemap->nodes[j].pnn);
2938                 if (state == NULL) {
2939                         /* we failed to send the control, treat this as
2940                            an error and try again next iteration
2941                         */
2942                         DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2943                         talloc_free(mem_ctx);
2944                         return MONITOR_FAILED;
2945                 }
2946
2947                 /* set up the callback functions */
2948                 state->async.fn = verify_recmode_normal_callback;
2949                 state->async.private_data = rmdata;
2950
2951                 /* one more control to wait for to complete */
2952                 rmdata->count++;
2953         }
2954
2955
2956         /* now wait for up to the maximum number of seconds allowed
2957            or until all nodes we expect a response from has replied
2958         */
2959         while (rmdata->count > 0) {
2960                 event_loop_once(ctdb->ev);
2961         }
2962
2963         status = rmdata->status;
2964         talloc_free(mem_ctx);
2965         return status;
2966 }
2967
2968
2969 struct verify_recmaster_data {
2970         struct ctdb_recoverd *rec;
2971         uint32_t count;
2972         uint32_t pnn;
2973         enum monitor_result status;
2974 };
2975
2976 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2977 {
2978         struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2979
2980
2981         /* one more node has responded with recmaster data*/
2982         rmdata->count--;
2983
2984         /* if we failed to get the recmaster, then return an error and let
2985            the main loop try again.
2986         */
2987         if (state->state != CTDB_CONTROL_DONE) {
2988                 if (rmdata->status == MONITOR_OK) {
2989                         rmdata->status = MONITOR_FAILED;
2990                 }
2991                 return;
2992         }
2993
2994         /* if we got a response, then the recmaster will be stored in the
2995            status field
2996         */
2997         if (state->status != rmdata->pnn) {
2998                 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2999                 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
3000                 rmdata->status = MONITOR_ELECTION_NEEDED;
3001         }
3002
3003         return;
3004 }
3005
3006
3007 /* verify that all nodes agree that we are the recmaster */
3008 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
3009 {
3010         struct ctdb_context *ctdb = rec->ctdb;
3011         struct verify_recmaster_data *rmdata;
3012         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3013         struct ctdb_client_control_state *state;
3014         enum monitor_result status;
3015         int j;
3016
3017         rmdata = talloc(mem_ctx, struct verify_recmaster_data);
3018         CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3019         rmdata->rec    = rec;
3020         rmdata->count  = 0;
3021         rmdata->pnn    = pnn;
3022         rmdata->status = MONITOR_OK;
3023
3024         /* loop over all active nodes and send an async getrecmaster call to
3025            them*/
3026         for (j=0; j<nodemap->num; j++) {
3027                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3028                         continue;
3029                 }
3030                 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3031                                         CONTROL_TIMEOUT(),
3032                                         nodemap->nodes[j].pnn);
3033                 if (state == NULL) {
3034                         /* we failed to send the control, treat this as
3035                            an error and try again next iteration
3036                         */
3037                         DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3038                         talloc_free(mem_ctx);
3039                         return MONITOR_FAILED;
3040                 }
3041
3042                 /* set up the callback functions */
3043                 state->async.fn = verify_recmaster_callback;
3044                 state->async.private_data = rmdata;
3045
3046                 /* one more control to wait for to complete */
3047                 rmdata->count++;
3048         }
3049
3050
3051         /* now wait for up to the maximum number of seconds allowed
3052            or until all nodes we expect a response from has replied
3053         */
3054         while (rmdata->count > 0) {
3055                 event_loop_once(ctdb->ev);
3056         }
3057
3058         status = rmdata->status;
3059         talloc_free(mem_ctx);
3060         return status;
3061 }
3062
3063 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3064                                     struct ctdb_recoverd *rec)
3065 {
3066         struct ctdb_control_get_ifaces *ifaces = NULL;
3067         TALLOC_CTX *mem_ctx;
3068         bool ret = false;
3069
3070         mem_ctx = talloc_new(NULL);
3071
3072         /* Read the interfaces from the local node */
3073         if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3074                                  CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3075                 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3076                 /* We could return an error.  However, this will be
3077                  * rare so we'll decide that the interfaces have
3078                  * actually changed, just in case.
3079                  */
3080                 talloc_free(mem_ctx);
3081                 return true;
3082         }
3083
3084         if (!rec->ifaces) {
3085                 /* We haven't been here before so things have changed */
3086                 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3087                 ret = true;
3088         } else if (rec->ifaces->num != ifaces->num) {
3089                 /* Number of interfaces has changed */
3090                 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3091                                      rec->ifaces->num, ifaces->num));
3092                 ret = true;
3093         } else {
3094                 /* See if interface names or link states have changed */
3095                 int i;
3096                 for (i = 0; i < rec->ifaces->num; i++) {
3097                         struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
3098                         if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3099                                 DEBUG(DEBUG_NOTICE,
3100                                       ("Interface in slot %d changed: %s => %s\n",
3101                                        i, iface->name, ifaces->ifaces[i].name));
3102                                 ret = true;
3103                                 break;
3104                         }
3105                         if (iface->link_state != ifaces->ifaces[i].link_state) {
3106                                 DEBUG(DEBUG_NOTICE,
3107                                       ("Interface %s changed state: %d => %d\n",
3108                                        iface->name, iface->link_state,
3109                                        ifaces->ifaces[i].link_state));
3110                                 ret = true;
3111                                 break;
3112                         }
3113                 }
3114         }
3115
3116         talloc_free(rec->ifaces);
3117         rec->ifaces = talloc_steal(rec, ifaces);
3118
3119         talloc_free(mem_ctx);
3120         return ret;
3121 }
3122
3123 /* called to check that the local allocation of public ip addresses is ok.
3124 */
3125 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
3126 {
3127         TALLOC_CTX *mem_ctx = talloc_new(NULL);
3128         struct ctdb_uptime *uptime1 = NULL;
3129         struct ctdb_uptime *uptime2 = NULL;
3130         int ret, j;
3131         bool need_takeover_run = false;
3132
3133         ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3134                                 CTDB_CURRENT_NODE, &uptime1);
3135         if (ret != 0) {
3136                 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3137                 talloc_free(mem_ctx);
3138                 return -1;
3139         }
3140
3141         if (interfaces_have_changed(ctdb, rec)) {
3142                 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3143                                      "local node %u - force takeover run\n",
3144                                      pnn));
3145                 need_takeover_run = true;
3146         }
3147
3148         ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3149                                 CTDB_CURRENT_NODE, &uptime2);
3150         if (ret != 0) {
3151                 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3152                 talloc_free(mem_ctx);
3153                 return -1;
3154         }
3155
3156         /* skip the check if the startrecovery time has changed */
3157         if (timeval_compare(&uptime1->last_recovery_started,
3158                             &uptime2->last_recovery_started) != 0) {
3159                 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3160                 talloc_free(mem_ctx);
3161                 return 0;
3162         }
3163
3164         /* skip the check if the endrecovery time has changed */
3165         if (timeval_compare(&uptime1->last_recovery_finished,
3166                             &uptime2->last_recovery_finished) != 0) {
3167                 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3168                 talloc_free(mem_ctx);
3169                 return 0;
3170         }
3171
3172         /* skip the check if we have started but not finished recovery */
3173         if (timeval_compare(&uptime1->last_recovery_finished,
3174                             &uptime1->last_recovery_started) != 1) {
3175                 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3176                 talloc_free(mem_ctx);
3177
3178                 return 0;
3179         }
3180
3181         /* verify that we have the ip addresses we should have
3182            and we dont have ones we shouldnt have.
3183            if we find an inconsistency we set recmode to
3184            active on the local node and wait for the recmaster
3185            to do a full blown recovery.
3186            also if the pnn is -1 and we are healthy and can host the ip
3187            we also request a ip reallocation.
3188         */
3189         if (ctdb->tunable.disable_ip_failover == 0) {
3190                 struct ctdb_all_public_ips *ips = NULL;
3191
3192                 /* read the *available* IPs from the local node */
3193                 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3194                 if (ret != 0) {
3195                         DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3196                         talloc_free(mem_ctx);
3197                         return -1;
3198                 }
3199
3200                 for (j=0; j<ips->num; j++) {
3201                         if (ips->ips[j].pnn == -1 &&
3202                             nodemap->nodes[pnn].flags == 0) {
3203                                 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3204                                                   ctdb_addr_to_str(&ips->ips[j].addr)));
3205                                 need_takeover_run = true;
3206                         }
3207                 }
3208
3209                 talloc_free(ips);
3210
3211                 /* read the *known* IPs from the local node */
3212                 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3213                 if (ret != 0) {
3214                         DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3215                         talloc_free(mem_ctx);
3216                         return -1;
3217                 }
3218
3219                 for (j=0; j<ips->num; j++) {
3220                         if (ips->ips[j].pnn == pnn) {
3221                                 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3222                                         DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3223                                                 ctdb_addr_to_str(&ips->ips[j].addr)));
3224                                         need_takeover_run = true;
3225                                 }
3226                         } else {
3227                                 if (ctdb->do_checkpublicip &&
3228                                     ctdb_sys_have_ip(&ips->ips[j].addr)) {
3229
3230                                         DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3231                                                 ctdb_addr_to_str(&ips->ips[j].addr)));
3232
3233                                         if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3234                                                 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3235                                         }
3236                                 }
3237                         }
3238                 }
3239         }
3240
3241         if (need_takeover_run) {
3242                 struct srvid_request rd;
3243                 TDB_DATA data;
3244
3245                 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3246
3247                 rd.pnn = ctdb->pnn;
3248                 rd.srvid = 0;
3249                 data.dptr = (uint8_t *)&rd;
3250                 data.dsize = sizeof(rd);
3251
3252                 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3253                 if (ret != 0) {
3254                         DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3255                 }
3256         }
3257         talloc_free(mem_ctx);
3258         return 0;
3259 }
3260
3261
3262 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3263 {
3264         struct ctdb_node_map **remote_nodemaps = callback_data;
3265
3266         if (node_pnn >= ctdb->num_nodes) {
3267                 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3268                 return;
3269         }
3270
3271         remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
3272
3273 }
3274
3275 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3276         struct ctdb_node_map *nodemap,
3277         struct ctdb_node_map **remote_nodemaps)
3278 {
3279         uint32_t *nodes;
3280
3281         nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3282         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3283                                         nodes, 0,
3284                                         CONTROL_TIMEOUT(), false, tdb_null,
3285                                         async_getnodemap_callback,
3286                                         NULL,
3287                                         remote_nodemaps) != 0) {
3288                 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3289
3290                 return -1;
3291         }
3292
3293         return 0;
3294 }
3295
3296 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3297 {
3298         TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3299         const char *reclockfile;
3300
3301         if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3302                 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3303                 talloc_free(tmp_ctx);
3304                 return -1;
3305         }
3306
3307         if (reclockfile == NULL) {
3308                 if (ctdb->recovery_lock_file != NULL) {
3309                         DEBUG(DEBUG_NOTICE,("Recovery lock file disabled\n"));
3310                         talloc_free(ctdb->recovery_lock_file);
3311                         ctdb->recovery_lock_file = NULL;
3312                         ctdb_recovery_unlock(ctdb);
3313                 }
3314                 talloc_free(tmp_ctx);
3315                 return 0;
3316         }
3317
3318         if (ctdb->recovery_lock_file == NULL) {
3319                 DEBUG(DEBUG_NOTICE,
3320                       ("Recovery lock file enabled (%s)\n", reclockfile));
3321                 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3322                 ctdb_recovery_unlock(ctdb);
3323                 talloc_free(tmp_ctx);
3324                 return 0;
3325         }
3326
3327
3328         if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3329                 talloc_free(tmp_ctx);
3330                 return 0;
3331         }
3332
3333         DEBUG(DEBUG_NOTICE,
3334               ("Recovery lock file changed (now %s)\n", reclockfile));
3335         talloc_free(ctdb->recovery_lock_file);
3336         ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3337         ctdb_recovery_unlock(ctdb);
3338
3339         talloc_free(tmp_ctx);
3340         return 0;
3341 }
3342
3343 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3344                       TALLOC_CTX *mem_ctx)
3345 {
3346         uint32_t pnn;
3347         struct ctdb_node_map *nodemap=NULL;
3348         struct ctdb_node_map *recmaster_nodemap=NULL;
3349         struct ctdb_node_map **remote_nodemaps=NULL;
3350         struct ctdb_vnn_map *vnnmap=NULL;
3351         struct ctdb_vnn_map *remote_vnnmap=NULL;
3352         int32_t debug_level;
3353         int i, j, ret;
3354         bool self_ban;
3355
3356
3357         /* verify that the main daemon is still running */
3358         if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3359                 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3360                 exit(-1);
3361         }
3362
3363         /* ping the local daemon to tell it we are alive */
3364         ctdb_ctrl_recd_ping(ctdb);
3365
3366         if (rec->election_timeout) {
3367                 /* an election is in progress */
3368                 return;
3369         }
3370
3371         /* read the debug level from the parent and update locally */
3372         ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3373         if (ret !=0) {
3374                 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3375                 return;
3376         }
3377         DEBUGLEVEL = debug_level;
3378
3379         /* get relevant tunables */
3380         ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3381         if (ret != 0) {
3382                 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3383                 return;
3384         }
3385
3386         /* get runstate */
3387         ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3388                                      CTDB_CURRENT_NODE, &ctdb->runstate);
3389         if (ret != 0) {
3390                 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3391                 return;
3392         }
3393
3394         /* get the current recovery lock file from the server */
3395         if (update_recovery_lock_file(ctdb) != 0) {
3396                 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3397                 return;
3398         }
3399
3400         /* Make sure that if recovery lock verification becomes disabled when
3401            we close the file
3402         */
3403         if (ctdb->recovery_lock_file == NULL) {
3404                 ctdb_recovery_unlock(ctdb);
3405         }
3406
3407         pnn = ctdb_get_pnn(ctdb);
3408
3409         /* get the vnnmap */
3410         ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3411         if (ret != 0) {
3412                 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3413                 return;
3414         }
3415
3416
3417         /* get number of nodes */
3418         if (rec->nodemap) {
3419                 talloc_free(rec->nodemap);
3420                 rec->nodemap = NULL;
3421                 nodemap=NULL;
3422         }
3423         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3424         if (ret != 0) {
3425                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3426                 return;
3427         }
3428         nodemap = rec->nodemap;
3429
3430         /* remember our own node flags */
3431         rec->node_flags = nodemap->nodes[pnn].flags;
3432
3433         ban_misbehaving_nodes(rec, &self_ban);
3434         if (self_ban) {
3435                 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3436                 return;
3437         }
3438
3439         /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3440            also frozen and that the recmode is set to active.
3441         */
3442         if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3443                 /* If this node has become inactive then we want to
3444                  * reduce the chances of it taking over the recovery
3445                  * master role when it becomes active again.  This
3446                  * helps to stabilise the recovery master role so that
3447                  * it stays on the most stable node.
3448                  */
3449                 rec->priority_time = timeval_current();
3450
3451                 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3452                 if (ret != 0) {
3453                         DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3454                 }
3455                 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3456                         DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3457
3458                         ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3459                         if (ret != 0) {
3460                                 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3461
3462                                 return;
3463                         }
3464                         ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3465                         if (ret != 0) {
3466                                 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3467                                 return;
3468                         }
3469                 }
3470
3471                 /* If this node is stopped or banned then it is not the recovery
3472                  * master, so don't do anything. This prevents stopped or banned
3473                  * node from starting election and sending unnecessary controls.
3474                  */
3475                 return;
3476         }
3477
3478         /* check which node is the recovery master */
3479         ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3480         if (ret != 0) {
3481                 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3482                 return;
3483         }
3484
3485         /* If we are not the recmaster then do some housekeeping */
3486         if (rec->recmaster != pnn) {
3487                 /* Ignore any IP reallocate requests - only recmaster
3488                  * processes them
3489                  */
3490                 TALLOC_FREE(rec->reallocate_requests);
3491                 /* Clear any nodes that should be force rebalanced in
3492                  * the next takeover run.  If the recovery master role
3493                  * has moved then we don't want to process these some
3494                  * time in the future.
3495                  */
3496                 TALLOC_FREE(rec->force_rebalance_nodes);
3497         }
3498
3499         /* This is a special case.  When recovery daemon is started, recmaster
3500          * is set to -1.  If a node is not started in stopped state, then
3501          * start election to decide recovery master
3502          */
3503         if (rec->recmaster == (uint32_t)-1) {
3504                 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3505                 force_election(rec, pnn, nodemap);
3506                 return;
3507         }
3508
3509         /* update the capabilities for all nodes */
3510         ret = update_capabilities(ctdb, nodemap);
3511         if (ret != 0) {
3512                 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3513                 return;
3514         }
3515
3516         /*
3517          * If the current recmaster does not have CTDB_CAP_RECMASTER,
3518          * but we have, then force an election and try to become the new
3519          * recmaster.
3520          */
3521         if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
3522             (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3523              !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3524                 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3525                                   " but we (node %u) have - force an election\n",
3526                                   rec->recmaster, pnn));
3527                 force_election(rec, pnn, nodemap);
3528                 return;
3529         }
3530
3531         /* count how many active nodes there are */
3532         rec->num_active    = 0;
3533         rec->num_lmasters  = 0;
3534         rec->num_connected = 0;
3535         for (i=0; i<nodemap->num; i++) {
3536                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3537                         rec->num_active++;
3538                         if (rec->ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER) {
3539                                 rec->num_lmasters++;
3540                         }
3541                 }
3542                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3543                         rec->num_connected++;
3544                 }
3545         }
3546
3547
3548         /* verify that the recmaster node is still active */
3549         for (j=0; j<nodemap->num; j++) {
3550                 if (nodemap->nodes[j].pnn==rec->recmaster) {
3551                         break;
3552                 }
3553         }
3554
3555         if (j == nodemap->num) {
3556                 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3557                 force_election(rec, pnn, nodemap);
3558                 return;
3559         }
3560
3561         /* if recovery master is disconnected we must elect a new recmaster */
3562         if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3563                 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3564                 force_election(rec, pnn, nodemap);
3565                 return;
3566         }
3567
3568         /* get nodemap from the recovery master to check if it is inactive */
3569         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3570                                    mem_ctx, &recmaster_nodemap);
3571         if (ret != 0) {
3572                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3573                           nodemap->nodes[j].pnn));
3574                 return;
3575         }
3576
3577
3578         if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3579             (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3580                 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3581                 /*
3582                  * update our nodemap to carry the recmaster's notion of
3583                  * its own flags, so that we don't keep freezing the
3584                  * inactive recmaster node...
3585                  */
3586                 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3587                 force_election(rec, pnn, nodemap);
3588                 return;
3589         }
3590
3591         /* verify that we have all ip addresses we should have and we dont
3592          * have addresses we shouldnt have.
3593          */
3594         if (ctdb->tunable.disable_ip_failover == 0 &&
3595             rec->takeover_runs_disable_ctx == NULL) {
3596                 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3597                         DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3598                 }
3599         }
3600
3601
3602         /* if we are not the recmaster then we do not need to check
3603            if recovery is needed
3604          */
3605         if (pnn != rec->recmaster) {
3606                 return;
3607         }
3608
3609
3610         /* ensure our local copies of flags are right */
3611         ret = update_local_flags(rec, nodemap);
3612         if (ret == MONITOR_ELECTION_NEEDED) {
3613                 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3614                 force_election(rec, pnn, nodemap);
3615                 return;
3616         }
3617         if (ret != MONITOR_OK) {
3618                 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3619                 return;
3620         }
3621
3622         if (ctdb->num_nodes != nodemap->num) {
3623                 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3624                 ctdb_load_nodes_file(ctdb);
3625                 return;
3626         }
3627
3628         /* verify that all active nodes agree that we are the recmaster */
3629         switch (verify_recmaster(rec, nodemap, pnn)) {
3630         case MONITOR_RECOVERY_NEEDED:
3631                 /* can not happen */
3632                 return;
3633         case MONITOR_ELECTION_NEEDED:
3634                 force_election(rec, pnn, nodemap);
3635                 return;
3636         case MONITOR_OK:
3637                 break;
3638         case MONITOR_FAILED:
3639                 return;
3640         }
3641
3642
3643         if (rec->need_recovery) {
3644                 /* a previous recovery didn't finish */
3645                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3646                 return;
3647         }
3648
3649         /* verify that all active nodes are in normal mode
3650            and not in recovery mode
3651         */
3652         switch (verify_recmode(ctdb, nodemap)) {
3653         case MONITOR_RECOVERY_NEEDED:
3654                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3655                 return;
3656         case MONITOR_FAILED:
3657                 return;
3658         case MONITOR_ELECTION_NEEDED:
3659                 /* can not happen */
3660         case MONITOR_OK:
3661                 break;
3662         }
3663
3664
3665         if (ctdb->recovery_lock_file != NULL) {
3666                 /* We must already hold the recovery lock */
3667                 if (!ctdb_recovery_have_lock(ctdb)) {
3668                         DEBUG(DEBUG_ERR,("Failed recovery lock sanity check.  Force a recovery\n"));
3669                         ctdb_set_culprit(rec, ctdb->pnn);
3670                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3671                         return;
3672                 }
3673         }
3674
3675
3676         /* if there are takeovers requested, perform it and notify the waiters */
3677         if (rec->takeover_runs_disable_ctx == NULL &&
3678             rec->reallocate_requests) {
3679                 process_ipreallocate_requests(ctdb, rec);
3680         }
3681
3682         /* get the nodemap for all active remote nodes
3683          */
3684         remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3685         if (remote_nodemaps == NULL) {
3686                 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3687                 return;
3688         }
3689         for(i=0; i<nodemap->num; i++) {
3690                 remote_nodemaps[i] = NULL;
3691         }
3692         if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3693                 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3694                 return;
3695         }
3696
3697         /* verify that all other nodes have the same nodemap as we have
3698         */
3699         for (j=0; j<nodemap->num; j++) {
3700                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3701                         continue;
3702                 }
3703
3704                 if (remote_nodemaps[j] == NULL) {
3705                         DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3706                         ctdb_set_culprit(rec, j);
3707
3708                         return;
3709                 }
3710
3711                 /* if the nodes disagree on how many nodes there are
3712                    then this is a good reason to try recovery
3713                  */
3714                 if (remote_nodemaps[j]->num != nodemap->num) {
3715                         DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3716                                   nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3717                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3718                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3719                         return;
3720                 }
3721
3722                 /* if the nodes disagree on which nodes exist and are
3723                    active, then that is also a good reason to do recovery
3724                  */
3725                 for (i=0;i<nodemap->num;i++) {
3726                         if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3727                                 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3728                                           nodemap->nodes[j].pnn, i,
3729                                           remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3730                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3731                                 do_recovery(rec, mem_ctx, pnn, nodemap,
3732                                             vnnmap);
3733                                 return;
3734                         }
3735                 }
3736         }
3737
3738         /*
3739          * Update node flags obtained from each active node. This ensure we have
3740          * up-to-date information for all the nodes.
3741          */
3742         for (j=0; j<nodemap->num; j++) {
3743                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3744                         continue;
3745                 }
3746                 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3747         }
3748
3749         for (j=0; j<nodemap->num; j++) {
3750                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3751                         continue;
3752                 }
3753
3754                 /* verify the flags are consistent
3755                 */
3756                 for (i=0; i<nodemap->num; i++) {
3757                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3758                                 continue;
3759                         }
3760
3761                         if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3762                                 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3763                                   nodemap->nodes[j].pnn,
3764                                   nodemap->nodes[i].pnn,
3765                                   remote_nodemaps[j]->nodes[i].flags,
3766                                   nodemap->nodes[i].flags));
3767                                 if (i == j) {
3768                                         DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3769                                         update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3770                                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3771                                         do_recovery(rec, mem_ctx, pnn, nodemap,
3772                                                     vnnmap);
3773                                         return;
3774                                 } else {
3775                                         DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3776                                         update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3777                                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3778                                         do_recovery(rec, mem_ctx, pnn, nodemap,
3779                                                     vnnmap);
3780                                         return;
3781                                 }
3782                         }
3783                 }
3784         }
3785
3786
3787         /* There must be the same number of lmasters in the vnn map as
3788          * there are active nodes with the lmaster capability...  or
3789          * do a recovery.
3790          */
3791         if (vnnmap->size != rec->num_lmasters) {
3792                 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3793                           vnnmap->size, rec->num_lmasters));
3794                 ctdb_set_culprit(rec, ctdb->pnn);
3795                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3796                 return;
3797         }
3798
3799         /* verify that all active nodes in the nodemap also exist in
3800            the vnnmap.
3801          */
3802         for (j=0; j<nodemap->num; j++) {
3803                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3804                         continue;
3805                 }
3806                 if (nodemap->nodes[j].pnn == pnn) {
3807                         continue;
3808                 }
3809
3810                 for (i=0; i<vnnmap->size; i++) {
3811                         if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3812                                 break;
3813                         }
3814                 }
3815                 if (i == vnnmap->size) {
3816                         DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3817                                   nodemap->nodes[j].pnn));
3818                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3819                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3820                         return;
3821                 }
3822         }
3823
3824
3825         /* verify that all other nodes have the same vnnmap
3826            and are from the same generation
3827          */
3828         for (j=0; j<nodemap->num; j++) {
3829                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3830                         continue;
3831                 }
3832                 if (nodemap->nodes[j].pnn == pnn) {
3833                         continue;
3834                 }
3835
3836                 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3837                                           mem_ctx, &remote_vnnmap);
3838                 if (ret != 0) {
3839                         DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3840                                   nodemap->nodes[j].pnn));
3841                         return;
3842                 }
3843
3844                 /* verify the vnnmap generation is the same */
3845                 if (vnnmap->generation != remote_vnnmap->generation) {
3846                         DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3847                                   nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3848                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3849                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3850                         return;
3851                 }
3852
3853                 /* verify the vnnmap size is the same */
3854                 if (vnnmap->size != remote_vnnmap->size) {
3855                         DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3856                                   nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3857                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3858                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3859                         return;
3860                 }
3861
3862                 /* verify the vnnmap is the same */
3863                 for (i=0;i<vnnmap->size;i++) {
3864                         if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3865                                 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3866                                           nodemap->nodes[j].pnn));
3867                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3868                                 do_recovery(rec, mem_ctx, pnn, nodemap,
3869                                             vnnmap);
3870                                 return;
3871                         }
3872                 }
3873         }
3874
3875         /* we might need to change who has what IP assigned */
3876         if (rec->need_takeover_run) {
3877                 uint32_t culprit = (uint32_t)-1;
3878
3879                 rec->need_takeover_run = false;
3880
3881                 /* update the list of public ips that a node can handle for
3882                    all connected nodes
3883                 */
3884                 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3885                 if (ret != 0) {
3886                         DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
3887                                          culprit));
3888                         rec->need_takeover_run = true;
3889                         return;
3890                 }
3891
3892                 /* execute the "startrecovery" event script on all nodes */
3893                 ret = run_startrecovery_eventscript(rec, nodemap);
3894                 if (ret!=0) {
3895                         DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3896                         ctdb_set_culprit(rec, ctdb->pnn);
3897                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3898                         return;
3899                 }
3900
3901                 /* If takeover run fails, then the offending nodes are
3902                  * assigned ban culprit counts. And we re-try takeover.
3903                  * If takeover run fails repeatedly, the node would get
3904                  * banned.
3905                  *
3906                  * If rec->need_takeover_run is not set to true at this
3907                  * failure, monitoring is disabled cluster-wide (via
3908                  * startrecovery eventscript) and will not get enabled.
3909                  */
3910                 if (!do_takeover_run(rec, nodemap, true)) {
3911                         return;
3912                 }
3913
3914                 /* execute the "recovered" event script on all nodes */
3915                 ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
3916 #if 0
3917 // we cant check whether the event completed successfully
3918 // since this script WILL fail if the node is in recovery mode
3919 // and if that race happens, the code here would just cause a second
3920 // cascading recovery.
3921                 if (ret!=0) {
3922                         DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3923                         ctdb_set_culprit(rec, ctdb->pnn);
3924                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3925                 }
3926 #endif
3927         }
3928 }
3929
3930 /*
3931   the main monitoring loop
3932  */
3933 static void monitor_cluster(struct ctdb_context *ctdb)
3934 {
3935         struct ctdb_recoverd *rec;
3936
3937         DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3938
3939         rec = talloc_zero(ctdb, struct ctdb_recoverd);
3940         CTDB_NO_MEMORY_FATAL(ctdb, rec);
3941
3942         rec->ctdb = ctdb;
3943
3944         rec->takeover_run_in_progress = false;
3945
3946         rec->priority_time = timeval_current();
3947
3948         /* register a message port for sending memory dumps */
3949         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3950
3951         /* register a message port for recovery elections */
3952         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
3953
3954         /* when nodes are disabled/enabled */
3955         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3956
3957         /* when we are asked to puch out a flag change */
3958         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3959
3960         /* register a message port for vacuum fetch */
3961         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3962
3963         /* register a message port for reloadnodes  */
3964         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3965
3966         /* register a message port for performing a takeover run */
3967         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3968
3969         /* register a message port for disabling the ip check for a short while */
3970         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3971
3972         /* register a message port for updating the recovery daemons node assignment for an ip */
3973         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
3974
3975         /* register a message port for forcing a rebalance of a node next
3976            reallocation */
3977         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3978
3979         /* Register a message port for disabling takeover runs */
3980         ctdb_client_set_message_handler(ctdb,
3981                                         CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3982                                         disable_takeover_runs_handler, rec);
3983
3984         /* register a message port for detaching database */
3985         ctdb_client_set_message_handler(ctdb,
3986                                         CTDB_SRVID_DETACH_DATABASE,
3987                                         detach_database_handler, rec);
3988
3989         for (;;) {
3990                 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3991                 struct timeval start;
3992                 double elapsed;
3993
3994                 if (!mem_ctx) {
3995                         DEBUG(DEBUG_CRIT,(__location__
3996                                           " Failed to create temp context\n"));
3997                         exit(-1);
3998                 }
3999
4000                 start = timeval_current();
4001                 main_loop(ctdb, rec, mem_ctx);
4002                 talloc_free(mem_ctx);
4003
4004                 /* we only check for recovery once every second */
4005                 elapsed = timeval_elapsed(&start);
4006                 if (elapsed < ctdb->tunable.recover_interval) {
4007                         ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
4008                                           - elapsed);
4009                 }
4010         }
4011 }
4012
4013 /*
4014   event handler for when the main ctdbd dies
4015  */
4016 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
4017                                  uint16_t flags, void *private_data)
4018 {
4019         DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
4020         _exit(1);
4021 }
4022
4023 /*
4024   called regularly to verify that the recovery daemon is still running
4025  */
4026 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
4027                               struct timeval yt, void *p)
4028 {
4029         struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
4030
4031         if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
4032                 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
4033
4034                 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
4035                                 ctdb_restart_recd, ctdb);
4036
4037                 return;
4038         }
4039
4040         event_add_timed(ctdb->ev, ctdb->recd_ctx,
4041                         timeval_current_ofs(30, 0),
4042                         ctdb_check_recd, ctdb);
4043 }
4044
4045 static void recd_sig_child_handler(struct event_context *ev,
4046         struct signal_event *se, int signum, int count,
4047         void *dont_care,
4048         void *private_data)
4049 {
4050 //      struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4051         int status;
4052         pid_t pid = -1;
4053
4054         while (pid != 0) {
4055                 pid = waitpid(-1, &status, WNOHANG);
4056                 if (pid == -1) {
4057                         if (errno != ECHILD) {
4058                                 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4059                         }
4060                         return;
4061                 }
4062                 if (pid > 0) {
4063                         DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
4064                 }
4065         }
4066 }
4067
4068 /*
4069   startup the recovery daemon as a child of the main ctdb daemon
4070  */
4071 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4072 {
4073         int fd[2];
4074         struct signal_event *se;
4075         struct tevent_fd *fde;
4076
4077         if (pipe(fd) != 0) {
4078                 return -1;
4079         }
4080
4081         ctdb->recoverd_pid = ctdb_fork(ctdb);
4082         if (ctdb->recoverd_pid == -1) {
4083                 return -1;
4084         }
4085
4086         if (ctdb->recoverd_pid != 0) {
4087                 talloc_free(ctdb->recd_ctx);
4088                 ctdb->recd_ctx = talloc_new(ctdb);
4089                 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4090
4091                 close(fd[0]);
4092                 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4093                                 timeval_current_ofs(30, 0),
4094                                 ctdb_check_recd, ctdb);
4095                 return 0;
4096         }
4097
4098         close(fd[1]);
4099
4100         srandom(getpid() ^ time(NULL));
4101
4102         ctdb_set_process_name("ctdb_recovered");
4103         if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4104                 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4105                 exit(1);
4106         }
4107
4108         DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
4109
4110         fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
4111                      ctdb_recoverd_parent, &fd[0]);
4112         tevent_fd_set_auto_close(fde);
4113
4114         /* set up a handler to pick up sigchld */
4115         se = event_add_signal(ctdb->ev, ctdb,
4116                                      SIGCHLD, 0,
4117                                      recd_sig_child_handler,
4118                                      ctdb);
4119         if (se == NULL) {
4120                 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4121                 exit(1);
4122         }
4123
4124         monitor_cluster(ctdb);
4125
4126         DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4127         return -1;
4128 }
4129
4130 /*
4131   shutdown the recovery daemon
4132  */
4133 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4134 {
4135         if (ctdb->recoverd_pid == 0) {
4136                 return;
4137         }
4138
4139         DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4140         ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
4141
4142         TALLOC_FREE(ctdb->recd_ctx);
4143         TALLOC_FREE(ctdb->recd_ping_count);
4144 }
4145
4146 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4147                        struct timeval t, void *private_data)
4148 {
4149         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4150
4151         DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4152         ctdb_stop_recoverd(ctdb);
4153         ctdb_start_recoverd(ctdb);
4154 }