ctdb/server/ctdb_recoverd.c

   1 /*
   2    ctdb recovery daemon
   3
   4    Copyright (C) Ronnie Sahlberg  2007
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20 #include "includes.h"
  21 #include "system/filesys.h"
  22 #include "system/time.h"
  23 #include "system/network.h"
  24 #include "system/wait.h"
  25 #include "popt.h"
  26 #include "cmdline.h"
  27 #include "../include/ctdb_client.h"
  28 #include "../include/ctdb_private.h"
  29 #include "db_wrap.h"
  30 #include "dlinklist.h"
  31
  32
  33 /* most recent reload all ips request we need to perform during the
  34    next monitoring loop
  35 */
  36 struct reloadips_all_reply *reload_all_ips_request = NULL;
  37
  38 /* list of "ctdb ipreallocate" processes to call back when we have
  39    finished the takeover run.
  40 */
  41 struct ip_reallocate_list {
  42         struct ip_reallocate_list *next;
  43         struct rd_memdump_reply *rd;
  44 };
  45
  46 struct ctdb_banning_state {
  47         uint32_t count;
  48         struct timeval last_reported_time;
  49 };
  50
  51 /*
  52   private state of recovery daemon
  53  */
  54 struct ctdb_recoverd {
  55         struct ctdb_context *ctdb;
  56         uint32_t recmaster;
  57         uint32_t num_active;
  58         uint32_t num_connected;
  59         uint32_t last_culprit_node;
  60         struct ctdb_node_map *nodemap;
  61         struct timeval priority_time;
  62         bool need_takeover_run;
  63         bool need_recovery;
  64         uint32_t node_flags;
  65         struct timed_event *send_election_te;
  66         struct timed_event *election_timeout;
  67         struct vacuum_info *vacuum_info;
  68         TALLOC_CTX *ip_reallocate_ctx;
  69         struct ip_reallocate_list *reallocate_callers;
  70         TALLOC_CTX *ip_check_disable_ctx;
  71         struct ctdb_control_get_ifaces *ifaces;
  72         TALLOC_CTX *deferred_rebalance_ctx;
  73 };
  74
  75 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
  76 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
  77
  78 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
  79
  80 /*
  81   ban a node for a period of time
  82  */
  83 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
  84 {
  85         int ret;
  86         struct ctdb_context *ctdb = rec->ctdb;
  87         struct ctdb_ban_time bantime;
  88
  89         DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
  90
  91         if (!ctdb_validate_pnn(ctdb, pnn)) {
  92                 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
  93                 return;
  94         }
  95
  96         bantime.pnn  = pnn;
  97         bantime.time = ban_time;
  98
  99         ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
 100         if (ret != 0) {
 101                 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
 102                 return;
 103         }
 104
 105 }
 106
 107 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
 108
 109
 110 /*
 111   remember the trouble maker
 112  */
 113 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
 114 {
 115         struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
 116         struct ctdb_banning_state *ban_state;
 117
 118         if (culprit > ctdb->num_nodes) {
 119                 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
 120                 return;
 121         }
 122
 123         if (ctdb->nodes[culprit]->ban_state == NULL) {
 124                 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
 125                 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
 126
 127
 128         }
 129         ban_state = ctdb->nodes[culprit]->ban_state;
 130         if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
 131                 /* this was the first time in a long while this node
 132                    misbehaved so we will forgive any old transgressions.
 133                 */
 134                 ban_state->count = 0;
 135         }
 136
 137         ban_state->count += count;
 138         ban_state->last_reported_time = timeval_current();
 139         rec->last_culprit_node = culprit;
 140 }
 141
 142 /*
 143   remember the trouble maker
 144  */
 145 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
 146 {
 147         ctdb_set_culprit_count(rec, culprit, 1);
 148 }
 149
 150
 151 /* this callback is called for every node that failed to execute the
 152    recovered event
 153 */
 154 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 155 {
 156         struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
 157
 158         DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
 159
 160         ctdb_set_culprit(rec, node_pnn);
 161 }
 162
 163 /*
 164   run the "recovered" eventscript on all nodes
 165  */
 166 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
 167 {
 168         TALLOC_CTX *tmp_ctx;
 169         uint32_t *nodes;
 170         struct ctdb_context *ctdb = rec->ctdb;
 171
 172         tmp_ctx = talloc_new(ctdb);
 173         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 174
 175         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 176         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
 177                                         nodes, 0,
 178                                         CONTROL_TIMEOUT(), false, tdb_null,
 179                                         NULL, recovered_fail_callback,
 180                                         rec) != 0) {
 181                 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
 182
 183                 talloc_free(tmp_ctx);
 184                 return -1;
 185         }
 186
 187         talloc_free(tmp_ctx);
 188         return 0;
 189 }
 190
 191 /* this callback is called for every node that failed to execute the
 192    start recovery event
 193 */
 194 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 195 {
 196         struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
 197
 198         DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
 199
 200         ctdb_set_culprit(rec, node_pnn);
 201 }
 202
 203 /*
 204   run the "startrecovery" eventscript on all nodes
 205  */
 206 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
 207 {
 208         TALLOC_CTX *tmp_ctx;
 209         uint32_t *nodes;
 210         struct ctdb_context *ctdb = rec->ctdb;
 211
 212         tmp_ctx = talloc_new(ctdb);
 213         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 214
 215         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 216         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
 217                                         nodes, 0,
 218                                         CONTROL_TIMEOUT(), false, tdb_null,
 219                                         NULL,
 220                                         startrecovery_fail_callback,
 221                                         rec) != 0) {
 222                 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
 223                 talloc_free(tmp_ctx);
 224                 return -1;
 225         }
 226
 227         talloc_free(tmp_ctx);
 228         return 0;
 229 }
 230
 231 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 232 {
 233         if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
 234                 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n",  (unsigned)outdata.dsize, outdata.dptr));
 235                 return;
 236         }
 237         if (node_pnn < ctdb->num_nodes) {
 238                 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
 239         }
 240
 241         if (node_pnn == ctdb->pnn) {
 242                 ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
 243         }
 244 }
 245
 246 /*
 247   update the node capabilities for all connected nodes
 248  */
 249 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
 250 {
 251         uint32_t *nodes;
 252         TALLOC_CTX *tmp_ctx;
 253
 254         tmp_ctx = talloc_new(ctdb);
 255         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 256
 257         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 258         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
 259                                         nodes, 0,
 260                                         CONTROL_TIMEOUT(),
 261                                         false, tdb_null,
 262                                         async_getcap_callback, NULL,
 263                                         NULL) != 0) {
 264                 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
 265                 talloc_free(tmp_ctx);
 266                 return -1;
 267         }
 268
 269         talloc_free(tmp_ctx);
 270         return 0;
 271 }
 272
 273 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 274 {
 275         struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
 276
 277         DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
 278         ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
 279 }
 280
 281 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 282 {
 283         struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
 284
 285         DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
 286         ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
 287 }
 288
 289 /*
 290   change recovery mode on all nodes
 291  */
 292 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
 293 {
 294         TDB_DATA data;
 295         uint32_t *nodes;
 296         TALLOC_CTX *tmp_ctx;
 297
 298         tmp_ctx = talloc_new(ctdb);
 299         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 300
 301         /* freeze all nodes */
 302         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 303         if (rec_mode == CTDB_RECOVERY_ACTIVE) {
 304                 int i;
 305
 306                 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
 307                         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
 308                                                 nodes, i,
 309                                                 CONTROL_TIMEOUT(),
 310                                                 false, tdb_null,
 311                                                 NULL,
 312                                                 set_recmode_fail_callback,
 313                                                 rec) != 0) {
 314                                 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
 315                                 talloc_free(tmp_ctx);
 316                                 return -1;
 317                         }
 318                 }
 319         }
 320
 321
 322         data.dsize = sizeof(uint32_t);
 323         data.dptr = (unsigned char *)&rec_mode;
 324
 325         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
 326                                         nodes, 0,
 327                                         CONTROL_TIMEOUT(),
 328                                         false, data,
 329                                         NULL, NULL,
 330                                         NULL) != 0) {
 331                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
 332                 talloc_free(tmp_ctx);
 333                 return -1;
 334         }
 335
 336         talloc_free(tmp_ctx);
 337         return 0;
 338 }
 339
 340 /*
 341   change recovery master on all node
 342  */
 343 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
 344 {
 345         TDB_DATA data;
 346         TALLOC_CTX *tmp_ctx;
 347         uint32_t *nodes;
 348
 349         tmp_ctx = talloc_new(ctdb);
 350         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 351
 352         data.dsize = sizeof(uint32_t);
 353         data.dptr = (unsigned char *)&pnn;
 354
 355         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 356         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
 357                                         nodes, 0,
 358                                         CONTROL_TIMEOUT(), false, data,
 359                                         NULL, NULL,
 360                                         NULL) != 0) {
 361                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
 362                 talloc_free(tmp_ctx);
 363                 return -1;
 364         }
 365
 366         talloc_free(tmp_ctx);
 367         return 0;
 368 }
 369
 370 /* update all remote nodes to use the same db priority that we have
 371    this can fail if the remove node has not yet been upgraded to
 372    support this function, so we always return success and never fail
 373    a recovery if this call fails.
 374 */
 375 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
 376         struct ctdb_node_map *nodemap,
 377         uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
 378 {
 379         int db;
 380         uint32_t *nodes;
 381
 382         nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
 383
 384         /* step through all local databases */
 385         for (db=0; db<dbmap->num;db++) {
 386                 TDB_DATA data;
 387                 struct ctdb_db_priority db_prio;
 388                 int ret;
 389
 390                 db_prio.db_id     = dbmap->dbs[db].dbid;
 391                 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
 392                 if (ret != 0) {
 393                         DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
 394                         continue;
 395                 }
 396
 397                 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
 398
 399                 data.dptr  = (uint8_t *)&db_prio;
 400                 data.dsize = sizeof(db_prio);
 401
 402                 if (ctdb_client_async_control(ctdb,
 403                                         CTDB_CONTROL_SET_DB_PRIORITY,
 404                                         nodes, 0,
 405                                         CONTROL_TIMEOUT(), false, data,
 406                                         NULL, NULL,
 407                                         NULL) != 0) {
 408                         DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
 409                 }
 410         }
 411
 412         return 0;
 413 }
 414
 415 /*
 416   ensure all other nodes have attached to any databases that we have
 417  */
 418 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
 419                                            uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
 420 {
 421         int i, j, db, ret;
 422         struct ctdb_dbid_map *remote_dbmap;
 423
 424         /* verify that all other nodes have all our databases */
 425         for (j=0; j<nodemap->num; j++) {
 426                 /* we dont need to ourself ourselves */
 427                 if (nodemap->nodes[j].pnn == pnn) {
 428                         continue;
 429                 }
 430                 /* dont check nodes that are unavailable */
 431                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 432                         continue;
 433                 }
 434
 435                 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 436                                          mem_ctx, &remote_dbmap);
 437                 if (ret != 0) {
 438                         DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
 439                         return -1;
 440                 }
 441
 442                 /* step through all local databases */
 443                 for (db=0; db<dbmap->num;db++) {
 444                         const char *name;
 445
 446
 447                         for (i=0;i<remote_dbmap->num;i++) {
 448                                 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
 449                                         break;
 450                                 }
 451                         }
 452                         /* the remote node already have this database */
 453                         if (i!=remote_dbmap->num) {
 454                                 continue;
 455                         }
 456                         /* ok so we need to create this database */
 457                         ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid,
 458                                             mem_ctx, &name);
 459                         if (ret != 0) {
 460                                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
 461                                 return -1;
 462                         }
 463                         ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 464                                            mem_ctx, name,
 465                                            dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
 466                         if (ret != 0) {
 467                                 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
 468                                 return -1;
 469                         }
 470                 }
 471         }
 472
 473         return 0;
 474 }
 475
 476
 477 /*
 478   ensure we are attached to any databases that anyone else is attached to
 479  */
 480 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
 481                                           uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
 482 {
 483         int i, j, db, ret;
 484         struct ctdb_dbid_map *remote_dbmap;
 485
 486         /* verify that we have all database any other node has */
 487         for (j=0; j<nodemap->num; j++) {
 488                 /* we dont need to ourself ourselves */
 489                 if (nodemap->nodes[j].pnn == pnn) {
 490                         continue;
 491                 }
 492                 /* dont check nodes that are unavailable */
 493                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 494                         continue;
 495                 }
 496
 497                 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 498                                          mem_ctx, &remote_dbmap);
 499                 if (ret != 0) {
 500                         DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
 501                         return -1;
 502                 }
 503
 504                 /* step through all databases on the remote node */
 505                 for (db=0; db<remote_dbmap->num;db++) {
 506                         const char *name;
 507
 508                         for (i=0;i<(*dbmap)->num;i++) {
 509                                 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
 510                                         break;
 511                                 }
 512                         }
 513                         /* we already have this db locally */
 514                         if (i!=(*dbmap)->num) {
 515                                 continue;
 516                         }
 517                         /* ok so we need to create this database and
 518                            rebuild dbmap
 519                          */
 520                         ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 521                                             remote_dbmap->dbs[db].dbid, mem_ctx, &name);
 522                         if (ret != 0) {
 523                                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
 524                                           nodemap->nodes[j].pnn));
 525                                 return -1;
 526                         }
 527                         ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
 528                                            remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
 529                         if (ret != 0) {
 530                                 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
 531                                 return -1;
 532                         }
 533                         ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
 534                         if (ret != 0) {
 535                                 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
 536                                 return -1;
 537                         }
 538                 }
 539         }
 540
 541         return 0;
 542 }
 543
 544
 545 /*
 546   pull the remote database contents from one node into the recdb
 547  */
 548 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
 549                                     struct tdb_wrap *recdb, uint32_t dbid)
 550 {
 551         int ret;
 552         TDB_DATA outdata;
 553         struct ctdb_marshall_buffer *reply;
 554         struct ctdb_rec_data *rec;
 555         int i;
 556         TALLOC_CTX *tmp_ctx = talloc_new(recdb);
 557
 558         ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
 559                                CONTROL_TIMEOUT(), &outdata);
 560         if (ret != 0) {
 561                 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
 562                 talloc_free(tmp_ctx);
 563                 return -1;
 564         }
 565
 566         reply = (struct ctdb_marshall_buffer *)outdata.dptr;
 567
 568         if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
 569                 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
 570                 talloc_free(tmp_ctx);
 571                 return -1;
 572         }
 573
 574         rec = (struct ctdb_rec_data *)&reply->data[0];
 575
 576         for (i=0;
 577              i<reply->count;
 578              rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
 579                 TDB_DATA key, data;
 580                 struct ctdb_ltdb_header *hdr;
 581                 TDB_DATA existing;
 582
 583                 key.dptr = &rec->data[0];
 584                 key.dsize = rec->keylen;
 585                 data.dptr = &rec->data[key.dsize];
 586                 data.dsize = rec->datalen;
 587
 588                 hdr = (struct ctdb_ltdb_header *)data.dptr;
 589
 590                 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
 591                         DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
 592                         talloc_free(tmp_ctx);
 593                         return -1;
 594                 }
 595
 596                 /* fetch the existing record, if any */
 597                 existing = tdb_fetch(recdb->tdb, key);
 598
 599                 if (existing.dptr != NULL) {
 600                         struct ctdb_ltdb_header header;
 601                         if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
 602                                 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
 603                                          (unsigned)existing.dsize, srcnode));
 604                                 free(existing.dptr);
 605                                 talloc_free(tmp_ctx);
 606                                 return -1;
 607                         }
 608                         header = *(struct ctdb_ltdb_header *)existing.dptr;
 609                         free(existing.dptr);
 610                         if (!(header.rsn < hdr->rsn ||
 611                               (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
 612                                 continue;
 613                         }
 614                 }
 615
 616                 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
 617                         DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
 618                         talloc_free(tmp_ctx);
 619                         return -1;
 620                 }
 621         }
 622
 623         talloc_free(tmp_ctx);
 624
 625         return 0;
 626 }
 627
 628
 629 struct pull_seqnum_cbdata {
 630         int failed;
 631         uint32_t pnn;
 632         uint64_t seqnum;
 633 };
 634
 635 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 636 {
 637         struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
 638         uint64_t seqnum;
 639
 640         if (cb_data->failed != 0) {
 641                 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
 642                 return;
 643         }
 644
 645         if (res != 0) {
 646                 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
 647                 cb_data->failed = 1;
 648                 return;
 649         }
 650
 651         if (outdata.dsize != sizeof(uint64_t)) {
 652                 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
 653                 cb_data->failed = -1;
 654                 return;
 655         }
 656
 657         seqnum = *((uint64_t *)outdata.dptr);
 658
 659         if (seqnum > cb_data->seqnum) {
 660                 cb_data->seqnum = seqnum;
 661                 cb_data->pnn = node_pnn;
 662         }
 663 }
 664
 665 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
 666 {
 667         struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
 668
 669         DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
 670         cb_data->failed = 1;
 671 }
 672
 673 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
 674                                 struct ctdb_recoverd *rec,
 675                                 struct ctdb_node_map *nodemap,
 676                                 struct tdb_wrap *recdb, uint32_t dbid)
 677 {
 678         TALLOC_CTX *tmp_ctx = talloc_new(NULL);
 679         uint32_t *nodes;
 680         TDB_DATA data;
 681         uint32_t outdata[2];
 682         struct pull_seqnum_cbdata *cb_data;
 683
 684         DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
 685
 686         outdata[0] = dbid;
 687         outdata[1] = 0;
 688
 689         data.dsize = sizeof(outdata);
 690         data.dptr  = (uint8_t *)&outdata[0];
 691
 692         cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
 693         if (cb_data == NULL) {
 694                 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
 695                 talloc_free(tmp_ctx);
 696                 return -1;
 697         }
 698
 699         cb_data->failed = 0;
 700         cb_data->pnn    = -1;
 701         cb_data->seqnum = 0;
 702
 703         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 704         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
 705                                         nodes, 0,
 706                                         CONTROL_TIMEOUT(), false, data,
 707                                         pull_seqnum_cb,
 708                                         pull_seqnum_fail_cb,
 709                                         cb_data) != 0) {
 710                 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
 711
 712                 talloc_free(tmp_ctx);
 713                 return -1;
 714         }
 715
 716         if (cb_data->failed != 0) {
 717                 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
 718                 talloc_free(tmp_ctx);
 719                 return -1;
 720         }
 721
 722         if (cb_data->seqnum == 0 || cb_data->pnn == -1) {
 723                 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
 724                 talloc_free(tmp_ctx);
 725                 return -1;
 726         }
 727
 728         DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
 729
 730         if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
 731                 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
 732                 talloc_free(tmp_ctx);
 733                 return -1;
 734         }
 735
 736         talloc_free(tmp_ctx);
 737         return 0;
 738 }
 739
 740
 741 /*
 742   pull all the remote database contents into the recdb
 743  */
 744 static int pull_remote_database(struct ctdb_context *ctdb,
 745                                 struct ctdb_recoverd *rec,
 746                                 struct ctdb_node_map *nodemap,
 747                                 struct tdb_wrap *recdb, uint32_t dbid,
 748                                 bool persistent)
 749 {
 750         int j;
 751
 752         if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
 753                 int ret;
 754                 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
 755                 if (ret == 0) {
 756                         return 0;
 757                 }
 758         }
 759
 760         /* pull all records from all other nodes across onto this node
 761            (this merges based on rsn)
 762         */
 763         for (j=0; j<nodemap->num; j++) {
 764                 /* dont merge from nodes that are unavailable */
 765                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 766                         continue;
 767                 }
 768                 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
 769                         DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
 770                                  nodemap->nodes[j].pnn));
 771                         ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
 772                         return -1;
 773                 }
 774         }
 775
 776         return 0;
 777 }
 778
 779
 780 /*
 781   update flags on all active nodes
 782  */
 783 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
 784 {
 785         int ret;
 786
 787         ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
 788                 if (ret != 0) {
 789                 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
 790                 return -1;
 791         }
 792
 793         return 0;
 794 }
 795
 796 /*
 797   ensure all nodes have the same vnnmap we do
 798  */
 799 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
 800                                       uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
 801 {
 802         int j, ret;
 803
 804         /* push the new vnn map out to all the nodes */
 805         for (j=0; j<nodemap->num; j++) {
 806                 /* dont push to nodes that are unavailable */
 807                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 808                         continue;
 809                 }
 810
 811                 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
 812                 if (ret != 0) {
 813                         DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
 814                         return -1;
 815                 }
 816         }
 817
 818         return 0;
 819 }
 820
 821
 822 struct vacuum_info {
 823         struct vacuum_info *next, *prev;
 824         struct ctdb_recoverd *rec;
 825         uint32_t srcnode;
 826         struct ctdb_db_context *ctdb_db;
 827         struct ctdb_marshall_buffer *recs;
 828         struct ctdb_rec_data *r;
 829 };
 830
 831 static void vacuum_fetch_next(struct vacuum_info *v);
 832
 833 /*
 834   called when a vacuum fetch has completed - just free it and do the next one
 835  */
 836 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
 837 {
 838         struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
 839         talloc_free(state);
 840         vacuum_fetch_next(v);
 841 }
 842
 843
 844 /*
 845   process the next element from the vacuum list
 846 */
 847 static void vacuum_fetch_next(struct vacuum_info *v)
 848 {
 849         struct ctdb_call call;
 850         struct ctdb_rec_data *r;
 851
 852         while (v->recs->count) {
 853                 struct ctdb_client_call_state *state;
 854                 TDB_DATA data;
 855                 struct ctdb_ltdb_header *hdr;
 856
 857                 ZERO_STRUCT(call);
 858                 call.call_id = CTDB_NULL_FUNC;
 859                 call.flags = CTDB_IMMEDIATE_MIGRATION;
 860                 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
 861
 862                 r = v->r;
 863                 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
 864                 v->recs->count--;
 865
 866                 call.key.dptr = &r->data[0];
 867                 call.key.dsize = r->keylen;
 868
 869                 /* ensure we don't block this daemon - just skip a record if we can't get
 870                    the chainlock */
 871                 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
 872                         continue;
 873                 }
 874
 875                 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
 876                 if (data.dptr == NULL) {
 877                         tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
 878                         continue;
 879                 }
 880
 881                 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
 882                         free(data.dptr);
 883                         tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
 884                         continue;
 885                 }
 886
 887                 hdr = (struct ctdb_ltdb_header *)data.dptr;
 888                 if (hdr->dmaster == v->rec->ctdb->pnn) {
 889                         /* its already local */
 890                         free(data.dptr);
 891                         tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
 892                         continue;
 893                 }
 894
 895                 free(data.dptr);
 896
 897                 state = ctdb_call_send(v->ctdb_db, &call);
 898                 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
 899                 if (state == NULL) {
 900                         DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
 901                         talloc_free(v);
 902                         return;
 903                 }
 904                 state->async.fn = vacuum_fetch_callback;
 905                 state->async.private_data = v;
 906                 return;
 907         }
 908
 909         talloc_free(v);
 910 }
 911
 912
 913 /*
 914   destroy a vacuum info structure
 915  */
 916 static int vacuum_info_destructor(struct vacuum_info *v)
 917 {
 918         DLIST_REMOVE(v->rec->vacuum_info, v);
 919         return 0;
 920 }
 921
 922
 923 /*
 924   handler for vacuum fetch
 925 */
 926 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
 927                                  TDB_DATA data, void *private_data)
 928 {
 929         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
 930         struct ctdb_marshall_buffer *recs;
 931         int ret, i;
 932         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
 933         const char *name;
 934         struct ctdb_dbid_map *dbmap=NULL;
 935         bool persistent = false;
 936         struct ctdb_db_context *ctdb_db;
 937         struct ctdb_rec_data *r;
 938         uint32_t srcnode;
 939         struct vacuum_info *v;
 940
 941         recs = (struct ctdb_marshall_buffer *)data.dptr;
 942         r = (struct ctdb_rec_data *)&recs->data[0];
 943
 944         if (recs->count == 0) {
 945                 talloc_free(tmp_ctx);
 946                 return;
 947         }
 948
 949         srcnode = r->reqid;
 950
 951         for (v=rec->vacuum_info;v;v=v->next) {
 952                 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
 953                         /* we're already working on records from this node */
 954                         talloc_free(tmp_ctx);
 955                         return;
 956                 }
 957         }
 958
 959         /* work out if the database is persistent */
 960         ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
 961         if (ret != 0) {
 962                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
 963                 talloc_free(tmp_ctx);
 964                 return;
 965         }
 966
 967         for (i=0;i<dbmap->num;i++) {
 968                 if (dbmap->dbs[i].dbid == recs->db_id) {
 969                         persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
 970                         break;
 971                 }
 972         }
 973         if (i == dbmap->num) {
 974                 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
 975                 talloc_free(tmp_ctx);
 976                 return;
 977         }
 978
 979         /* find the name of this database */
 980         if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
 981                 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
 982                 talloc_free(tmp_ctx);
 983                 return;
 984         }
 985
 986         /* attach to it */
 987         ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
 988         if (ctdb_db == NULL) {
 989                 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
 990                 talloc_free(tmp_ctx);
 991                 return;
 992         }
 993
 994         v = talloc_zero(rec, struct vacuum_info);
 995         if (v == NULL) {
 996                 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
 997                 talloc_free(tmp_ctx);
 998                 return;
 999         }
1000
1001         v->rec = rec;
1002         v->srcnode = srcnode;
1003         v->ctdb_db = ctdb_db;
1004         v->recs = talloc_memdup(v, recs, data.dsize);
1005         if (v->recs == NULL) {
1006                 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1007                 talloc_free(v);
1008                 talloc_free(tmp_ctx);
1009                 return;
1010         }
1011         v->r =  (struct ctdb_rec_data *)&v->recs->data[0];
1012
1013         DLIST_ADD(rec->vacuum_info, v);
1014
1015         talloc_set_destructor(v, vacuum_info_destructor);
1016
1017         vacuum_fetch_next(v);
1018         talloc_free(tmp_ctx);
1019 }
1020
1021
1022 /*
1023   called when ctdb_wait_timeout should finish
1024  */
1025 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1026                               struct timeval yt, void *p)
1027 {
1028         uint32_t *timed_out = (uint32_t *)p;
1029         (*timed_out) = 1;
1030 }
1031
1032 /*
1033   wait for a given number of seconds
1034  */
1035 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1036 {
1037         uint32_t timed_out = 0;
1038         time_t usecs = (secs - (time_t)secs) * 1000000;
1039         event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1040         while (!timed_out) {
1041                 event_loop_once(ctdb->ev);
1042         }
1043 }
1044
1045 /*
1046   called when an election times out (ends)
1047  */
1048 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1049                                   struct timeval t, void *p)
1050 {
1051         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1052         rec->election_timeout = NULL;
1053         fast_start = false;
1054
1055         DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
1056 }
1057
1058
1059 /*
1060   wait for an election to finish. It finished election_timeout seconds after
1061   the last election packet is received
1062  */
1063 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1064 {
1065         struct ctdb_context *ctdb = rec->ctdb;
1066         while (rec->election_timeout) {
1067                 event_loop_once(ctdb->ev);
1068         }
1069 }
1070
1071 /*
1072   Update our local flags from all remote connected nodes.
1073   This is only run when we are or we belive we are the recovery master
1074  */
1075 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1076 {
1077         int j;
1078         struct ctdb_context *ctdb = rec->ctdb;
1079         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1080
1081         /* get the nodemap for all active remote nodes and verify
1082            they are the same as for this node
1083          */
1084         for (j=0; j<nodemap->num; j++) {
1085                 struct ctdb_node_map *remote_nodemap=NULL;
1086                 int ret;
1087
1088                 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1089                         continue;
1090                 }
1091                 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1092                         continue;
1093                 }
1094
1095                 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1096                                            mem_ctx, &remote_nodemap);
1097                 if (ret != 0) {
1098                         DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1099                                   nodemap->nodes[j].pnn));
1100                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1101                         talloc_free(mem_ctx);
1102                         return MONITOR_FAILED;
1103                 }
1104                 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1105                         /* We should tell our daemon about this so it
1106                            updates its flags or else we will log the same
1107                            message again in the next iteration of recovery.
1108                            Since we are the recovery master we can just as
1109                            well update the flags on all nodes.
1110                         */
1111                         ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, nodemap->nodes[j].flags, ~nodemap->nodes[j].flags);
1112                         if (ret != 0) {
1113                                 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1114                                 return -1;
1115                         }
1116
1117                         /* Update our local copy of the flags in the recovery
1118                            daemon.
1119                         */
1120                         DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1121                                  nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1122                                  nodemap->nodes[j].flags));
1123                         nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1124                 }
1125                 talloc_free(remote_nodemap);
1126         }
1127         talloc_free(mem_ctx);
1128         return MONITOR_OK;
1129 }
1130
1131
1132 /* Create a new random generation ip.
1133    The generation id can not be the INVALID_GENERATION id
1134 */
1135 static uint32_t new_generation(void)
1136 {
1137         uint32_t generation;
1138
1139         while (1) {
1140                 generation = random();
1141
1142                 if (generation != INVALID_GENERATION) {
1143                         break;
1144                 }
1145         }
1146
1147         return generation;
1148 }
1149
1150
1151 /*
1152   create a temporary working database
1153  */
1154 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1155 {
1156         char *name;
1157         struct tdb_wrap *recdb;
1158         unsigned tdb_flags;
1159
1160         /* open up the temporary recovery database */
1161         name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1162                                ctdb->db_directory_state,
1163                                ctdb->pnn);
1164         if (name == NULL) {
1165                 return NULL;
1166         }
1167         unlink(name);
1168
1169         tdb_flags = TDB_NOLOCK;
1170         if (ctdb->valgrinding) {
1171                 tdb_flags |= TDB_NOMMAP;
1172         }
1173         tdb_flags |= TDB_DISALLOW_NESTING;
1174
1175         recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1176                               tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1177         if (recdb == NULL) {
1178                 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1179         }
1180
1181         talloc_free(name);
1182
1183         return recdb;
1184 }
1185
1186
1187 /*
1188    a traverse function for pulling all relevant records from recdb
1189  */
1190 struct recdb_data {
1191         struct ctdb_context *ctdb;
1192         struct ctdb_marshall_buffer *recdata;
1193         uint32_t len;
1194         uint32_t allocated_len;
1195         bool failed;
1196         bool persistent;
1197 };
1198
1199 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1200 {
1201         struct recdb_data *params = (struct recdb_data *)p;
1202         struct ctdb_rec_data *rec;
1203         struct ctdb_ltdb_header *hdr;
1204
1205         /* skip empty records */
1206         if (data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1207                 return 0;
1208         }
1209
1210         /* update the dmaster field to point to us */
1211         hdr = (struct ctdb_ltdb_header *)data.dptr;
1212         if (!params->persistent) {
1213                 hdr->dmaster = params->ctdb->pnn;
1214                 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1215         }
1216
1217         /* add the record to the blob ready to send to the nodes */
1218         rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1219         if (rec == NULL) {
1220                 params->failed = true;
1221                 return -1;
1222         }
1223         if (params->len + rec->length >= params->allocated_len) {
1224                 params->allocated_len = rec->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1225                 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1226         }
1227         if (params->recdata == NULL) {
1228                 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u (%u records)\n",
1229                          rec->length + params->len, params->recdata->count));
1230                 params->failed = true;
1231                 return -1;
1232         }
1233         params->recdata->count++;
1234         memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1235         params->len += rec->length;
1236         talloc_free(rec);
1237
1238         return 0;
1239 }
1240
1241 /*
1242   push the recdb database out to all nodes
1243  */
1244 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1245                                bool persistent,
1246                                struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1247 {
1248         struct recdb_data params;
1249         struct ctdb_marshall_buffer *recdata;
1250         TDB_DATA outdata;
1251         TALLOC_CTX *tmp_ctx;
1252         uint32_t *nodes;
1253
1254         tmp_ctx = talloc_new(ctdb);
1255         CTDB_NO_MEMORY(ctdb, tmp_ctx);
1256
1257         recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1258         CTDB_NO_MEMORY(ctdb, recdata);
1259
1260         recdata->db_id = dbid;
1261
1262         params.ctdb = ctdb;
1263         params.recdata = recdata;
1264         params.len = offsetof(struct ctdb_marshall_buffer, data);
1265         params.allocated_len = params.len;
1266         params.failed = false;
1267         params.persistent = persistent;
1268
1269         if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1270                 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1271                 talloc_free(params.recdata);
1272                 talloc_free(tmp_ctx);
1273                 return -1;
1274         }
1275
1276         if (params.failed) {
1277                 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1278                 talloc_free(params.recdata);
1279                 talloc_free(tmp_ctx);
1280                 return -1;
1281         }
1282
1283         recdata = params.recdata;
1284
1285         outdata.dptr = (void *)recdata;
1286         outdata.dsize = params.len;
1287
1288         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1289         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1290                                         nodes, 0,
1291                                         CONTROL_TIMEOUT(), false, outdata,
1292                                         NULL, NULL,
1293                                         NULL) != 0) {
1294                 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1295                 talloc_free(recdata);
1296                 talloc_free(tmp_ctx);
1297                 return -1;
1298         }
1299
1300         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1301                   dbid, recdata->count));
1302
1303         talloc_free(recdata);
1304         talloc_free(tmp_ctx);
1305
1306         return 0;
1307 }
1308
1309
1310 /*
1311   go through a full recovery on one database
1312  */
1313 static int recover_database(struct ctdb_recoverd *rec,
1314                             TALLOC_CTX *mem_ctx,
1315                             uint32_t dbid,
1316                             bool persistent,
1317                             uint32_t pnn,
1318                             struct ctdb_node_map *nodemap,
1319                             uint32_t transaction_id)
1320 {
1321         struct tdb_wrap *recdb;
1322         int ret;
1323         struct ctdb_context *ctdb = rec->ctdb;
1324         TDB_DATA data;
1325         struct ctdb_control_wipe_database w;
1326         uint32_t *nodes;
1327
1328         recdb = create_recdb(ctdb, mem_ctx);
1329         if (recdb == NULL) {
1330                 return -1;
1331         }
1332
1333         /* pull all remote databases onto the recdb */
1334         ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1335         if (ret != 0) {
1336                 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1337                 return -1;
1338         }
1339
1340         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1341
1342         /* wipe all the remote databases. This is safe as we are in a transaction */
1343         w.db_id = dbid;
1344         w.transaction_id = transaction_id;
1345
1346         data.dptr = (void *)&w;
1347         data.dsize = sizeof(w);
1348
1349         nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1350         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1351                                         nodes, 0,
1352                                         CONTROL_TIMEOUT(), false, data,
1353                                         NULL, NULL,
1354                                         NULL) != 0) {
1355                 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1356                 talloc_free(recdb);
1357                 return -1;
1358         }
1359
1360         /* push out the correct database. This sets the dmaster and skips
1361            the empty records */
1362         ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1363         if (ret != 0) {
1364                 talloc_free(recdb);
1365                 return -1;
1366         }
1367
1368         /* all done with this database */
1369         talloc_free(recdb);
1370
1371         return 0;
1372 }
1373
1374 /*
1375   reload the nodes file
1376 */
1377 static void reload_nodes_file(struct ctdb_context *ctdb)
1378 {
1379         ctdb->nodes = NULL;
1380         ctdb_load_nodes_file(ctdb);
1381 }
1382
1383 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1384                                          struct ctdb_recoverd *rec,
1385                                          struct ctdb_node_map *nodemap,
1386                                          uint32_t *culprit)
1387 {
1388         int j;
1389         int ret;
1390
1391         if (ctdb->num_nodes != nodemap->num) {
1392                 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1393                                   ctdb->num_nodes, nodemap->num));
1394                 if (culprit) {
1395                         *culprit = ctdb->pnn;
1396                 }
1397                 return -1;
1398         }
1399
1400         for (j=0; j<nodemap->num; j++) {
1401                 /* release any existing data */
1402                 if (ctdb->nodes[j]->known_public_ips) {
1403                         talloc_free(ctdb->nodes[j]->known_public_ips);
1404                         ctdb->nodes[j]->known_public_ips = NULL;
1405                 }
1406                 if (ctdb->nodes[j]->available_public_ips) {
1407                         talloc_free(ctdb->nodes[j]->available_public_ips);
1408                         ctdb->nodes[j]->available_public_ips = NULL;
1409                 }
1410
1411                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1412                         continue;
1413                 }
1414
1415                 /* grab a new shiny list of public ips from the node */
1416                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1417                                         CONTROL_TIMEOUT(),
1418                                         ctdb->nodes[j]->pnn,
1419                                         ctdb->nodes,
1420                                         0,
1421                                         &ctdb->nodes[j]->known_public_ips);
1422                 if (ret != 0) {
1423                         DEBUG(DEBUG_ERR,("Failed to read known public ips from node : %u\n",
1424                                 ctdb->nodes[j]->pnn));
1425                         if (culprit) {
1426                                 *culprit = ctdb->nodes[j]->pnn;
1427                         }
1428                         return -1;
1429                 }
1430
1431                 if (ctdb->do_checkpublicip) {
1432                         if (rec->ip_check_disable_ctx == NULL) {
1433                                 if (verify_remote_ip_allocation(ctdb, ctdb->nodes[j]->known_public_ips)) {
1434                                         DEBUG(DEBUG_ERR,("Node %d has inconsistent public ip allocation and needs update.\n", ctdb->nodes[j]->pnn));
1435                                         rec->need_takeover_run = true;
1436                                 }
1437                         }
1438                 }
1439
1440                 /* grab a new shiny list of public ips from the node */
1441                 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1442                                         CONTROL_TIMEOUT(),
1443                                         ctdb->nodes[j]->pnn,
1444                                         ctdb->nodes,
1445                                         CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1446                                         &ctdb->nodes[j]->available_public_ips);
1447                 if (ret != 0) {
1448                         DEBUG(DEBUG_ERR,("Failed to read available public ips from node : %u\n",
1449                                 ctdb->nodes[j]->pnn));
1450                         if (culprit) {
1451                                 *culprit = ctdb->nodes[j]->pnn;
1452                         }
1453                         return -1;
1454                 }
1455         }
1456
1457         return 0;
1458 }
1459
1460 /* when we start a recovery, make sure all nodes use the same reclock file
1461    setting
1462 */
1463 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1464 {
1465         struct ctdb_context *ctdb = rec->ctdb;
1466         TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1467         TDB_DATA data;
1468         uint32_t *nodes;
1469
1470         if (ctdb->recovery_lock_file == NULL) {
1471                 data.dptr  = NULL;
1472                 data.dsize = 0;
1473         } else {
1474                 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1475                 data.dptr  = (uint8_t *)ctdb->recovery_lock_file;
1476         }
1477
1478         nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1479         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1480                                         nodes, 0,
1481                                         CONTROL_TIMEOUT(),
1482                                         false, data,
1483                                         NULL, NULL,
1484                                         rec) != 0) {
1485                 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1486                 talloc_free(tmp_ctx);
1487                 return -1;
1488         }
1489
1490         talloc_free(tmp_ctx);
1491         return 0;
1492 }
1493
1494
1495 /*
1496  * this callback is called for every node that failed to execute ctdb_takeover_run()
1497  * and set flag to re-run takeover run.
1498  */
1499 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1500 {
1501         struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1502
1503         DEBUG(DEBUG_ERR, (__location__ " Node %u failed the takeover run. Setting it as recovery fail culprit\n", node_pnn));
1504
1505         ctdb_set_culprit(rec, node_pnn);
1506         rec->need_takeover_run = true;
1507 }
1508
1509
1510 /*
1511   we are the recmaster, and recovery is needed - start a recovery run
1512  */
1513 static int do_recovery(struct ctdb_recoverd *rec,
1514                        TALLOC_CTX *mem_ctx, uint32_t pnn,
1515                        struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1516 {
1517         struct ctdb_context *ctdb = rec->ctdb;
1518         int i, j, ret;
1519         uint32_t generation;
1520         struct ctdb_dbid_map *dbmap;
1521         TDB_DATA data;
1522         uint32_t *nodes;
1523         struct timeval start_time;
1524         uint32_t culprit = (uint32_t)-1;
1525
1526         DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1527
1528         /* if recovery fails, force it again */
1529         rec->need_recovery = true;
1530
1531         for (i=0; i<ctdb->num_nodes; i++) {
1532                 struct ctdb_banning_state *ban_state;
1533
1534                 if (ctdb->nodes[i]->ban_state == NULL) {
1535                         continue;
1536                 }
1537                 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1538                 if (ban_state->count < 2*ctdb->num_nodes) {
1539                         continue;
1540                 }
1541                 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
1542                         ctdb->nodes[i]->pnn, ban_state->count,
1543                         ctdb->tunable.recovery_ban_period));
1544                 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1545                 ban_state->count = 0;
1546         }
1547
1548
1549         if (ctdb->tunable.verify_recovery_lock != 0) {
1550                 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1551                 start_time = timeval_current();
1552                 if (!ctdb_recovery_lock(ctdb, true)) {
1553                         DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1554                                          "and ban ourself for %u seconds\n",
1555                                          ctdb->tunable.recovery_ban_period));
1556                         ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1557                         return -1;
1558                 }
1559                 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1560                 DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1561         }
1562
1563         DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1564
1565         /* get a list of all databases */
1566         ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1567         if (ret != 0) {
1568                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1569                 return -1;
1570         }
1571
1572         /* we do the db creation before we set the recovery mode, so the freeze happens
1573            on all databases we will be dealing with. */
1574
1575         /* verify that we have all the databases any other node has */
1576         ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1577         if (ret != 0) {
1578                 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1579                 return -1;
1580         }
1581
1582         /* verify that all other nodes have all our databases */
1583         ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1584         if (ret != 0) {
1585                 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1586                 return -1;
1587         }
1588         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1589
1590         /* update the database priority for all remote databases */
1591         ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1592         if (ret != 0) {
1593                 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1594         }
1595         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1596
1597
1598         /* update all other nodes to use the same setting for reclock files
1599            as the local recovery master.
1600         */
1601         sync_recovery_lock_file_across_cluster(rec);
1602
1603         /* set recovery mode to active on all nodes */
1604         ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1605         if (ret != 0) {
1606                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1607                 return -1;
1608         }
1609
1610         /* execute the "startrecovery" event script on all nodes */
1611         ret = run_startrecovery_eventscript(rec, nodemap);
1612         if (ret!=0) {
1613                 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1614                 return -1;
1615         }
1616
1617         /*
1618           update all nodes to have the same flags that we have
1619          */
1620         for (i=0;i<nodemap->num;i++) {
1621                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1622                         continue;
1623                 }
1624
1625                 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1626                 if (ret != 0) {
1627                         DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1628                         return -1;
1629                 }
1630         }
1631
1632         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1633
1634         /* pick a new generation number */
1635         generation = new_generation();
1636
1637         /* change the vnnmap on this node to use the new generation
1638            number but not on any other nodes.
1639            this guarantees that if we abort the recovery prematurely
1640            for some reason (a node stops responding?)
1641            that we can just return immediately and we will reenter
1642            recovery shortly again.
1643            I.e. we deliberately leave the cluster with an inconsistent
1644            generation id to allow us to abort recovery at any stage and
1645            just restart it from scratch.
1646          */
1647         vnnmap->generation = generation;
1648         ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1649         if (ret != 0) {
1650                 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1651                 return -1;
1652         }
1653
1654         data.dptr = (void *)&generation;
1655         data.dsize = sizeof(uint32_t);
1656
1657         nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1658         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1659                                         nodes, 0,
1660                                         CONTROL_TIMEOUT(), false, data,
1661                                         NULL,
1662                                         transaction_start_fail_callback,
1663                                         rec) != 0) {
1664                 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1665                 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1666                                         nodes, 0,
1667                                         CONTROL_TIMEOUT(), false, tdb_null,
1668                                         NULL,
1669                                         NULL,
1670                                         NULL) != 0) {
1671                         DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1672                 }
1673                 return -1;
1674         }
1675
1676         DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1677
1678         for (i=0;i<dbmap->num;i++) {
1679                 ret = recover_database(rec, mem_ctx,
1680                                        dbmap->dbs[i].dbid,
1681                                        dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1682                                        pnn, nodemap, generation);
1683                 if (ret != 0) {
1684                         DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1685                         return -1;
1686                 }
1687         }
1688
1689         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1690
1691         /* commit all the changes */
1692         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1693                                         nodes, 0,
1694                                         CONTROL_TIMEOUT(), false, data,
1695                                         NULL, NULL,
1696                                         NULL) != 0) {
1697                 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1698                 return -1;
1699         }
1700
1701         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1702
1703
1704         /* update the capabilities for all nodes */
1705         ret = update_capabilities(ctdb, nodemap);
1706         if (ret!=0) {
1707                 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1708                 return -1;
1709         }
1710
1711         /* build a new vnn map with all the currently active and
1712            unbanned nodes */
1713         generation = new_generation();
1714         vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1715         CTDB_NO_MEMORY(ctdb, vnnmap);
1716         vnnmap->generation = generation;
1717         vnnmap->size = 0;
1718         vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1719         CTDB_NO_MEMORY(ctdb, vnnmap->map);
1720         for (i=j=0;i<nodemap->num;i++) {
1721                 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1722                         continue;
1723                 }
1724                 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1725                         /* this node can not be an lmaster */
1726                         DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
1727                         continue;
1728                 }
1729
1730                 vnnmap->size++;
1731                 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1732                 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1733                 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1734
1735         }
1736         if (vnnmap->size == 0) {
1737                 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1738                 vnnmap->size++;
1739                 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1740                 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1741                 vnnmap->map[0] = pnn;
1742         }
1743
1744         /* update to the new vnnmap on all nodes */
1745         ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1746         if (ret != 0) {
1747                 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1748                 return -1;
1749         }
1750
1751         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1752
1753         /* update recmaster to point to us for all nodes */
1754         ret = set_recovery_master(ctdb, nodemap, pnn);
1755         if (ret!=0) {
1756                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1757                 return -1;
1758         }
1759
1760         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1761
1762         /*
1763           update all nodes to have the same flags that we have
1764          */
1765         for (i=0;i<nodemap->num;i++) {
1766                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1767                         continue;
1768                 }
1769
1770                 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1771                 if (ret != 0) {
1772                         DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1773                         return -1;
1774                 }
1775         }
1776
1777         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1778
1779         /* disable recovery mode */
1780         ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1781         if (ret != 0) {
1782                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1783                 return -1;
1784         }
1785
1786         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1787
1788         /*
1789           tell nodes to takeover their public IPs
1790          */
1791         ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
1792         if (ret != 0) {
1793                 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
1794                                  culprit));
1795                 rec->need_takeover_run = true;
1796                 return -1;
1797         }
1798         rec->need_takeover_run = false;
1799         ret = ctdb_takeover_run(ctdb, nodemap, NULL, NULL);
1800         if (ret != 0) {
1801                 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
1802                 rec->need_takeover_run = true;
1803         }
1804
1805         /* execute the "recovered" event script on all nodes */
1806         ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
1807         if (ret!=0) {
1808                 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1809                 return -1;
1810         }
1811
1812         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1813
1814         /* send a message to all clients telling them that the cluster
1815            has been reconfigured */
1816         ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
1817
1818         DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1819
1820         rec->need_recovery = false;
1821
1822         /* we managed to complete a full recovery, make sure to forgive
1823            any past sins by the nodes that could now participate in the
1824            recovery.
1825         */
1826         DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1827         for (i=0;i<nodemap->num;i++) {
1828                 struct ctdb_banning_state *ban_state;
1829
1830                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1831                         continue;
1832                 }
1833
1834                 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1835                 if (ban_state == NULL) {
1836                         continue;
1837                 }
1838
1839                 ban_state->count = 0;
1840         }
1841
1842
1843         /* We just finished a recovery successfully.
1844            We now wait for rerecovery_timeout before we allow
1845            another recovery to take place.
1846         */
1847         DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1848         ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
1849         DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
1850
1851         return 0;
1852 }
1853
1854
1855 /*
1856   elections are won by first checking the number of connected nodes, then
1857   the priority time, then the pnn
1858  */
1859 struct election_message {
1860         uint32_t num_connected;
1861         struct timeval priority_time;
1862         uint32_t pnn;
1863         uint32_t node_flags;
1864 };
1865
1866 /*
1867   form this nodes election data
1868  */
1869 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1870 {
1871         int ret, i;
1872         struct ctdb_node_map *nodemap;
1873         struct ctdb_context *ctdb = rec->ctdb;
1874
1875         ZERO_STRUCTP(em);
1876
1877         em->pnn = rec->ctdb->pnn;
1878         em->priority_time = rec->priority_time;
1879
1880         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1881         if (ret != 0) {
1882                 DEBUG(DEBUG_ERR,(__location__ " unable to get election data\n"));
1883                 return;
1884         }
1885
1886         rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1887         em->node_flags = rec->node_flags;
1888
1889         for (i=0;i<nodemap->num;i++) {
1890                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1891                         em->num_connected++;
1892                 }
1893         }
1894
1895         /* we shouldnt try to win this election if we cant be a recmaster */
1896         if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1897                 em->num_connected = 0;
1898                 em->priority_time = timeval_current();
1899         }
1900
1901         talloc_free(nodemap);
1902 }
1903
1904 /*
1905   see if the given election data wins
1906  */
1907 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1908 {
1909         struct election_message myem;
1910         int cmp = 0;
1911
1912         ctdb_election_data(rec, &myem);
1913
1914         /* we cant win if we dont have the recmaster capability */
1915         if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1916                 return false;
1917         }
1918
1919         /* we cant win if we are banned */
1920         if (rec->node_flags & NODE_FLAGS_BANNED) {
1921                 return false;
1922         }
1923
1924         /* we cant win if we are stopped */
1925         if (rec->node_flags & NODE_FLAGS_STOPPED) {
1926                 return false;
1927         }
1928
1929         /* we will automatically win if the other node is banned */
1930         if (em->node_flags & NODE_FLAGS_BANNED) {
1931                 return true;
1932         }
1933
1934         /* we will automatically win if the other node is banned */
1935         if (em->node_flags & NODE_FLAGS_STOPPED) {
1936                 return true;
1937         }
1938
1939         /* try to use the most connected node */
1940         if (cmp == 0) {
1941                 cmp = (int)myem.num_connected - (int)em->num_connected;
1942         }
1943
1944         /* then the longest running node */
1945         if (cmp == 0) {
1946                 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1947         }
1948
1949         if (cmp == 0) {
1950                 cmp = (int)myem.pnn - (int)em->pnn;
1951         }
1952
1953         return cmp > 0;
1954 }
1955
1956 /*
1957   send out an election request
1958  */
1959 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn, bool update_recmaster)
1960 {
1961         int ret;
1962         TDB_DATA election_data;
1963         struct election_message emsg;
1964         uint64_t srvid;
1965         struct ctdb_context *ctdb = rec->ctdb;
1966
1967         srvid = CTDB_SRVID_RECOVERY;
1968
1969         ctdb_election_data(rec, &emsg);
1970
1971         election_data.dsize = sizeof(struct election_message);
1972         election_data.dptr  = (unsigned char *)&emsg;
1973
1974
1975         /* send an election message to all active nodes */
1976         DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1977         ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1978
1979
1980         /* A new node that is already frozen has entered the cluster.
1981            The existing nodes are not frozen and dont need to be frozen
1982            until the election has ended and we start the actual recovery
1983         */
1984         if (update_recmaster == true) {
1985                 /* first we assume we will win the election and set
1986                    recoverymaster to be ourself on the current node
1987                  */
1988                 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
1989                 if (ret != 0) {
1990                         DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
1991                         return -1;
1992                 }
1993         }
1994
1995
1996         return 0;
1997 }
1998
1999 /*
2000   this function will unban all nodes in the cluster
2001 */
2002 static void unban_all_nodes(struct ctdb_context *ctdb)
2003 {
2004         int ret, i;
2005         struct ctdb_node_map *nodemap;
2006         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2007
2008         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2009         if (ret != 0) {
2010                 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2011                 return;
2012         }
2013
2014         for (i=0;i<nodemap->num;i++) {
2015                 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2016                   && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2017                         ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
2018                 }
2019         }
2020
2021         talloc_free(tmp_ctx);
2022 }
2023
2024
2025 /*
2026   we think we are winning the election - send a broadcast election request
2027  */
2028 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2029 {
2030         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2031         int ret;
2032
2033         ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb), false);
2034         if (ret != 0) {
2035                 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2036         }
2037
2038         talloc_free(rec->send_election_te);
2039         rec->send_election_te = NULL;
2040 }
2041
2042 /*
2043   handler for memory dumps
2044 */
2045 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2046                              TDB_DATA data, void *private_data)
2047 {
2048         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2049         TDB_DATA *dump;
2050         int ret;
2051         struct rd_memdump_reply *rd;
2052
2053         if (data.dsize != sizeof(struct rd_memdump_reply)) {
2054                 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2055                 talloc_free(tmp_ctx);
2056                 return;
2057         }
2058         rd = (struct rd_memdump_reply *)data.dptr;
2059
2060         dump = talloc_zero(tmp_ctx, TDB_DATA);
2061         if (dump == NULL) {
2062                 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2063                 talloc_free(tmp_ctx);
2064                 return;
2065         }
2066         ret = ctdb_dump_memory(ctdb, dump);
2067         if (ret != 0) {
2068                 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2069                 talloc_free(tmp_ctx);
2070                 return;
2071         }
2072
2073 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2074
2075         ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2076         if (ret != 0) {
2077                 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2078                 talloc_free(tmp_ctx);
2079                 return;
2080         }
2081
2082         talloc_free(tmp_ctx);
2083 }
2084
2085 /*
2086   handler for getlog
2087 */
2088 static void getlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2089                            TDB_DATA data, void *private_data)
2090 {
2091         struct ctdb_get_log_addr *log_addr;
2092         pid_t child;
2093
2094         if (data.dsize != sizeof(struct ctdb_get_log_addr)) {
2095                 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2096                 return;
2097         }
2098         log_addr = (struct ctdb_get_log_addr *)data.dptr;
2099
2100         child = ctdb_fork(ctdb);
2101         if (child == (pid_t)-1) {
2102                 DEBUG(DEBUG_ERR,("Failed to fork a log collector child\n"));
2103                 return;
2104         }
2105
2106         if (child == 0) {
2107                 if (switch_from_server_to_client(ctdb, "recoverd-log-collector") != 0) {
2108                         DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch log collector child into client mode.\n"));
2109                         _exit(1);
2110                 }
2111                 ctdb_collect_log(ctdb, log_addr);
2112                 _exit(0);
2113         }
2114 }
2115
2116 /*
2117   handler for clearlog
2118 */
2119 static void clearlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2120                              TDB_DATA data, void *private_data)
2121 {
2122         ctdb_clear_log(ctdb);
2123 }
2124
2125 /*
2126   handler for reload_nodes
2127 */
2128 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2129                              TDB_DATA data, void *private_data)
2130 {
2131         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2132
2133         DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2134
2135         reload_nodes_file(rec->ctdb);
2136 }
2137
2138
2139 static void reenable_ip_check(struct event_context *ev, struct timed_event *te,
2140                               struct timeval yt, void *p)
2141 {
2142         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2143
2144         talloc_free(rec->ip_check_disable_ctx);
2145         rec->ip_check_disable_ctx = NULL;
2146 }
2147
2148
2149 static void ctdb_rebalance_timeout(struct event_context *ev, struct timed_event *te,
2150                                   struct timeval t, void *p)
2151 {
2152         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2153         struct ctdb_context *ctdb = rec->ctdb;
2154         int ret;
2155
2156         DEBUG(DEBUG_NOTICE,("Rebalance all nodes that have had ip assignment changes.\n"));
2157
2158         ret = ctdb_takeover_run(ctdb, rec->nodemap, NULL, NULL);
2159         if (ret != 0) {
2160                 DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
2161                 rec->need_takeover_run = true;
2162         }
2163
2164         talloc_free(rec->deferred_rebalance_ctx);
2165         rec->deferred_rebalance_ctx = NULL;
2166 }
2167
2168
2169 static void recd_node_rebalance_handler(struct ctdb_context *ctdb, uint64_t srvid,
2170                              TDB_DATA data, void *private_data)
2171 {
2172         uint32_t pnn;
2173         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2174
2175         if (data.dsize != sizeof(uint32_t)) {
2176                 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2177                 return;
2178         }
2179
2180         if (ctdb->tunable.deferred_rebalance_on_node_add == 0) {
2181                 return;
2182         }
2183
2184         pnn = *(uint32_t *)&data.dptr[0];
2185
2186         lcp2_forcerebalance(ctdb, pnn);
2187         DEBUG(DEBUG_NOTICE,("Received message to perform node rebalancing for node %d\n", pnn));
2188
2189         if (rec->deferred_rebalance_ctx != NULL) {
2190                 talloc_free(rec->deferred_rebalance_ctx);
2191         }
2192         rec->deferred_rebalance_ctx = talloc_new(rec);
2193         event_add_timed(ctdb->ev, rec->deferred_rebalance_ctx,
2194                         timeval_current_ofs(ctdb->tunable.deferred_rebalance_on_node_add, 0),
2195                         ctdb_rebalance_timeout, rec);
2196 }
2197
2198
2199
2200 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2201                              TDB_DATA data, void *private_data)
2202 {
2203         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2204         struct ctdb_public_ip *ip;
2205
2206         if (rec->recmaster != rec->ctdb->pnn) {
2207                 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2208                 return;
2209         }
2210
2211         if (data.dsize != sizeof(struct ctdb_public_ip)) {
2212                 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2213                 return;
2214         }
2215
2216         ip = (struct ctdb_public_ip *)data.dptr;
2217
2218         update_ip_assignment_tree(rec->ctdb, ip);
2219 }
2220
2221
2222 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2223                              TDB_DATA data, void *private_data)
2224 {
2225         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2226         uint32_t timeout;
2227
2228         if (rec->ip_check_disable_ctx != NULL) {
2229                 talloc_free(rec->ip_check_disable_ctx);
2230                 rec->ip_check_disable_ctx = NULL;
2231         }
2232
2233         if (data.dsize != sizeof(uint32_t)) {
2234                 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2235                                  "expexting %lu\n", (long unsigned)data.dsize,
2236                                  (long unsigned)sizeof(uint32_t)));
2237                 return;
2238         }
2239         if (data.dptr == NULL) {
2240                 DEBUG(DEBUG_ERR,(__location__ " No data recaived\n"));
2241                 return;
2242         }
2243
2244         timeout = *((uint32_t *)data.dptr);
2245
2246         if (timeout == 0) {
2247                 DEBUG(DEBUG_NOTICE,("Reenabling ip check\n"));
2248                 return;
2249         }
2250
2251         DEBUG(DEBUG_NOTICE,("Disabling ip check for %u seconds\n", timeout));
2252
2253         rec->ip_check_disable_ctx = talloc_new(rec);
2254         CTDB_NO_MEMORY_VOID(ctdb, rec->ip_check_disable_ctx);
2255
2256         event_add_timed(ctdb->ev, rec->ip_check_disable_ctx, timeval_current_ofs(timeout, 0), reenable_ip_check, rec);
2257 }
2258
2259
2260 /*
2261   handler for reload all ips.
2262 */
2263 static void ip_reloadall_handler(struct ctdb_context *ctdb, uint64_t srvid,
2264                              TDB_DATA data, void *private_data)
2265 {
2266         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2267
2268         if (data.dsize != sizeof(struct reloadips_all_reply)) {
2269                 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2270                 return;
2271         }
2272
2273         reload_all_ips_request = (struct reloadips_all_reply *)talloc_steal(rec, data.dptr);
2274
2275         DEBUG(DEBUG_NOTICE,("RELOAD_ALL_IPS message received from node:%d srvid:%d\n", reload_all_ips_request->pnn, (int)reload_all_ips_request->srvid));
2276         return;
2277 }
2278
2279 static void async_reloadips_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2280 {
2281         uint32_t *status = callback_data;
2282
2283         if (res != 0) {
2284                 DEBUG(DEBUG_ERR,("Reload ips all failed on node %d\n", node_pnn));
2285                 *status = 1;
2286         }
2287 }
2288
2289 static int
2290 reload_all_ips(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, struct reloadips_all_reply *rips)
2291 {
2292         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2293         uint32_t *nodes;
2294         uint32_t status;
2295         int i;
2296
2297         DEBUG(DEBUG_ERR,("RELOAD ALL IPS on all active nodes\n"));
2298         for (i = 0; i< nodemap->num; i++) {
2299                 if (nodemap->nodes[i].flags != 0) {
2300                         DEBUG(DEBUG_ERR, ("Can not reload ips on all nodes. Node %d is not up and healthy\n", i));
2301                         talloc_free(tmp_ctx);
2302                         return -1;
2303                 }
2304         }
2305
2306         /* send the flags update to all connected nodes */
2307         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2308         status = 0;
2309         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RELOAD_PUBLIC_IPS,
2310                                         nodes, 0,
2311                                         CONTROL_TIMEOUT(),
2312                                         false, tdb_null,
2313                                         async_reloadips_callback, NULL,
2314                                         &status) != 0) {
2315                 DEBUG(DEBUG_ERR, (__location__ " Failed to reloadips on all nodes.\n"));
2316                 talloc_free(tmp_ctx);
2317                 return -1;
2318         }
2319
2320         if (status != 0) {
2321                 DEBUG(DEBUG_ERR, (__location__ " Failed to reloadips on all nodes.\n"));
2322                 talloc_free(tmp_ctx);
2323                 return -1;
2324         }
2325
2326         ctdb_client_send_message(ctdb, rips->pnn, rips->srvid, tdb_null);
2327
2328         talloc_free(tmp_ctx);
2329         return 0;
2330 }
2331
2332
2333 /*
2334   handler for ip reallocate, just add it to the list of callers and
2335   handle this later in the monitor_cluster loop so we do not recurse
2336   with other callers to takeover_run()
2337 */
2338 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2339                              TDB_DATA data, void *private_data)
2340 {
2341         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2342         struct ip_reallocate_list *caller;
2343
2344         if (data.dsize != sizeof(struct rd_memdump_reply)) {
2345                 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2346                 return;
2347         }
2348
2349         if (rec->ip_reallocate_ctx == NULL) {
2350                 rec->ip_reallocate_ctx = talloc_new(rec);
2351                 CTDB_NO_MEMORY_FATAL(ctdb, rec->ip_reallocate_ctx);
2352         }
2353
2354         caller = talloc(rec->ip_reallocate_ctx, struct ip_reallocate_list);
2355         CTDB_NO_MEMORY_FATAL(ctdb, caller);
2356
2357         caller->rd   = (struct rd_memdump_reply *)talloc_steal(caller, data.dptr);
2358         caller->next = rec->reallocate_callers;
2359         rec->reallocate_callers = caller;
2360
2361         return;
2362 }
2363
2364 static void process_ipreallocate_requests(struct ctdb_context *ctdb, struct ctdb_recoverd *rec)
2365 {
2366         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2367         TDB_DATA result;
2368         int32_t ret;
2369         struct ip_reallocate_list *callers;
2370         uint32_t culprit;
2371
2372         DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2373
2374         /* update the list of public ips that a node can handle for
2375            all connected nodes
2376         */
2377         ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2378         if (ret != 0) {
2379                 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2380                                  culprit));
2381                 rec->need_takeover_run = true;
2382         }
2383         if (ret == 0) {
2384                 ret = ctdb_takeover_run(ctdb, rec->nodemap, NULL, NULL);
2385                 if (ret != 0) {
2386                         DEBUG(DEBUG_ERR,("Failed to reallocate addresses: ctdb_takeover_run() failed.\n"));
2387                         rec->need_takeover_run = true;
2388                 }
2389         }
2390
2391         result.dsize = sizeof(int32_t);
2392         result.dptr  = (uint8_t *)&ret;
2393
2394         for (callers=rec->reallocate_callers; callers; callers=callers->next) {
2395
2396                 /* Someone that sent srvid==0 does not want a reply */
2397                 if (callers->rd->srvid == 0) {
2398                         continue;
2399                 }
2400                 DEBUG(DEBUG_INFO,("Sending ip reallocate reply message to "
2401                                   "%u:%llu\n", (unsigned)callers->rd->pnn,
2402                                   (unsigned long long)callers->rd->srvid));
2403                 ret = ctdb_client_send_message(ctdb, callers->rd->pnn, callers->rd->srvid, result);
2404                 if (ret != 0) {
2405                         DEBUG(DEBUG_ERR,("Failed to send ip reallocate reply "
2406                                          "message to %u:%llu\n",
2407                                          (unsigned)callers->rd->pnn,
2408                                          (unsigned long long)callers->rd->srvid));
2409                 }
2410         }
2411
2412         talloc_free(tmp_ctx);
2413         talloc_free(rec->ip_reallocate_ctx);
2414         rec->ip_reallocate_ctx = NULL;
2415         rec->reallocate_callers = NULL;
2416
2417 }
2418
2419
2420 /*
2421   handler for recovery master elections
2422 */
2423 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2424                              TDB_DATA data, void *private_data)
2425 {
2426         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2427         int ret;
2428         struct election_message *em = (struct election_message *)data.dptr;
2429         TALLOC_CTX *mem_ctx;
2430
2431         /* we got an election packet - update the timeout for the election */
2432         talloc_free(rec->election_timeout);
2433         rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2434                                                 fast_start ?
2435                                                 timeval_current_ofs(0, 500000) :
2436                                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2437                                                 ctdb_election_timeout, rec);
2438
2439         mem_ctx = talloc_new(ctdb);
2440
2441         /* someone called an election. check their election data
2442            and if we disagree and we would rather be the elected node,
2443            send a new election message to all other nodes
2444          */
2445         if (ctdb_election_win(rec, em)) {
2446                 if (!rec->send_election_te) {
2447                         rec->send_election_te = event_add_timed(ctdb->ev, rec,
2448                                                                 timeval_current_ofs(0, 500000),
2449                                                                 election_send_request, rec);
2450                 }
2451                 talloc_free(mem_ctx);
2452                 /*unban_all_nodes(ctdb);*/
2453                 return;
2454         }
2455
2456         /* we didn't win */
2457         talloc_free(rec->send_election_te);
2458         rec->send_election_te = NULL;
2459
2460         if (ctdb->tunable.verify_recovery_lock != 0) {
2461                 /* release the recmaster lock */
2462                 if (em->pnn != ctdb->pnn &&
2463                     ctdb->recovery_lock_fd != -1) {
2464                         close(ctdb->recovery_lock_fd);
2465                         ctdb->recovery_lock_fd = -1;
2466                         unban_all_nodes(ctdb);
2467                 }
2468         }
2469
2470         /* ok, let that guy become recmaster then */
2471         ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2472         if (ret != 0) {
2473                 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2474                 talloc_free(mem_ctx);
2475                 return;
2476         }
2477
2478         talloc_free(mem_ctx);
2479         return;
2480 }
2481
2482
2483 /*
2484   force the start of the election process
2485  */
2486 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2487                            struct ctdb_node_map *nodemap)
2488 {
2489         int ret;
2490         struct ctdb_context *ctdb = rec->ctdb;
2491
2492         DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2493
2494         /* set all nodes to recovery mode to stop all internode traffic */
2495         ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2496         if (ret != 0) {
2497                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2498                 return;
2499         }
2500
2501         talloc_free(rec->election_timeout);
2502         rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2503                                                 fast_start ?
2504                                                 timeval_current_ofs(0, 500000) :
2505                                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2506                                                 ctdb_election_timeout, rec);
2507
2508         ret = send_election_request(rec, pnn, true);
2509         if (ret!=0) {
2510                 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2511                 return;
2512         }
2513
2514         /* wait for a few seconds to collect all responses */
2515         ctdb_wait_election(rec);
2516 }
2517
2518
2519
2520 /*
2521   handler for when a node changes its flags
2522 */
2523 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2524                             TDB_DATA data, void *private_data)
2525 {
2526         int ret;
2527         struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2528         struct ctdb_node_map *nodemap=NULL;
2529         TALLOC_CTX *tmp_ctx;
2530         int i;
2531         struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2532         int disabled_flag_changed;
2533
2534         if (data.dsize != sizeof(*c)) {
2535                 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2536                 return;
2537         }
2538
2539         tmp_ctx = talloc_new(ctdb);
2540         CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2541
2542         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2543         if (ret != 0) {
2544                 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2545                 talloc_free(tmp_ctx);
2546                 return;
2547         }
2548
2549
2550         for (i=0;i<nodemap->num;i++) {
2551                 if (nodemap->nodes[i].pnn == c->pnn) break;
2552         }
2553
2554         if (i == nodemap->num) {
2555                 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2556                 talloc_free(tmp_ctx);
2557                 return;
2558         }
2559
2560         if (nodemap->nodes[i].flags != c->new_flags) {
2561                 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, nodemap->nodes[i].flags));
2562         }
2563
2564         disabled_flag_changed =  (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2565
2566         nodemap->nodes[i].flags = c->new_flags;
2567
2568         ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2569                                      CTDB_CURRENT_NODE, &ctdb->recovery_master);
2570
2571         if (ret == 0) {
2572                 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2573                                            CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2574         }
2575
2576         if (ret == 0 &&
2577             ctdb->recovery_master == ctdb->pnn &&
2578             ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2579                 /* Only do the takeover run if the perm disabled or unhealthy
2580                    flags changed since these will cause an ip failover but not
2581                    a recovery.
2582                    If the node became disconnected or banned this will also
2583                    lead to an ip address failover but that is handled
2584                    during recovery
2585                 */
2586                 if (disabled_flag_changed) {
2587                         rec->need_takeover_run = true;
2588                 }
2589         }
2590
2591         talloc_free(tmp_ctx);
2592 }
2593
2594 /*
2595   handler for when we need to push out flag changes ot all other nodes
2596 */
2597 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2598                             TDB_DATA data, void *private_data)
2599 {
2600         int ret;
2601         struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2602         struct ctdb_node_map *nodemap=NULL;
2603         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2604         uint32_t recmaster;
2605         uint32_t *nodes;
2606
2607         /* find the recovery master */
2608         ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2609         if (ret != 0) {
2610                 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2611                 talloc_free(tmp_ctx);
2612                 return;
2613         }
2614
2615         /* read the node flags from the recmaster */
2616         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2617         if (ret != 0) {
2618                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2619                 talloc_free(tmp_ctx);
2620                 return;
2621         }
2622         if (c->pnn >= nodemap->num) {
2623                 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2624                 talloc_free(tmp_ctx);
2625                 return;
2626         }
2627
2628         /* send the flags update to all connected nodes */
2629         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2630
2631         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2632                                       nodes, 0, CONTROL_TIMEOUT(),
2633                                       false, data,
2634                                       NULL, NULL,
2635                                       NULL) != 0) {
2636                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2637
2638                 talloc_free(tmp_ctx);
2639                 return;
2640         }
2641
2642         talloc_free(tmp_ctx);
2643 }
2644
2645
2646 struct verify_recmode_normal_data {
2647         uint32_t count;
2648         enum monitor_result status;
2649 };
2650
2651 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2652 {
2653         struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2654
2655
2656         /* one more node has responded with recmode data*/
2657         rmdata->count--;
2658
2659         /* if we failed to get the recmode, then return an error and let
2660            the main loop try again.
2661         */
2662         if (state->state != CTDB_CONTROL_DONE) {
2663                 if (rmdata->status == MONITOR_OK) {
2664                         rmdata->status = MONITOR_FAILED;
2665                 }
2666                 return;
2667         }
2668
2669         /* if we got a response, then the recmode will be stored in the
2670            status field
2671         */
2672         if (state->status != CTDB_RECOVERY_NORMAL) {
2673                 DEBUG(DEBUG_NOTICE, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
2674                 rmdata->status = MONITOR_RECOVERY_NEEDED;
2675         }
2676
2677         return;
2678 }
2679
2680
2681 /* verify that all nodes are in normal recovery mode */
2682 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2683 {
2684         struct verify_recmode_normal_data *rmdata;
2685         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2686         struct ctdb_client_control_state *state;
2687         enum monitor_result status;
2688         int j;
2689
2690         rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2691         CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2692         rmdata->count  = 0;
2693         rmdata->status = MONITOR_OK;
2694
2695         /* loop over all active nodes and send an async getrecmode call to
2696            them*/
2697         for (j=0; j<nodemap->num; j++) {
2698                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2699                         continue;
2700                 }
2701                 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2702                                         CONTROL_TIMEOUT(),
2703                                         nodemap->nodes[j].pnn);
2704                 if (state == NULL) {
2705                         /* we failed to send the control, treat this as
2706                            an error and try again next iteration
2707                         */
2708                         DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2709                         talloc_free(mem_ctx);
2710                         return MONITOR_FAILED;
2711                 }
2712
2713                 /* set up the callback functions */
2714                 state->async.fn = verify_recmode_normal_callback;
2715                 state->async.private_data = rmdata;
2716
2717                 /* one more control to wait for to complete */
2718                 rmdata->count++;
2719         }
2720
2721
2722         /* now wait for up to the maximum number of seconds allowed
2723            or until all nodes we expect a response from has replied
2724         */
2725         while (rmdata->count > 0) {
2726                 event_loop_once(ctdb->ev);
2727         }
2728
2729         status = rmdata->status;
2730         talloc_free(mem_ctx);
2731         return status;
2732 }
2733
2734
2735 struct verify_recmaster_data {
2736         struct ctdb_recoverd *rec;
2737         uint32_t count;
2738         uint32_t pnn;
2739         enum monitor_result status;
2740 };
2741
2742 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2743 {
2744         struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2745
2746
2747         /* one more node has responded with recmaster data*/
2748         rmdata->count--;
2749
2750         /* if we failed to get the recmaster, then return an error and let
2751            the main loop try again.
2752         */
2753         if (state->state != CTDB_CONTROL_DONE) {
2754                 if (rmdata->status == MONITOR_OK) {
2755                         rmdata->status = MONITOR_FAILED;
2756                 }
2757                 return;
2758         }
2759
2760         /* if we got a response, then the recmaster will be stored in the
2761            status field
2762         */
2763         if (state->status != rmdata->pnn) {
2764                 DEBUG(DEBUG_ERR,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
2765                 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2766                 rmdata->status = MONITOR_ELECTION_NEEDED;
2767         }
2768
2769         return;
2770 }
2771
2772
2773 /* verify that all nodes agree that we are the recmaster */
2774 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2775 {
2776         struct ctdb_context *ctdb = rec->ctdb;
2777         struct verify_recmaster_data *rmdata;
2778         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2779         struct ctdb_client_control_state *state;
2780         enum monitor_result status;
2781         int j;
2782
2783         rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2784         CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2785         rmdata->rec    = rec;
2786         rmdata->count  = 0;
2787         rmdata->pnn    = pnn;
2788         rmdata->status = MONITOR_OK;
2789
2790         /* loop over all active nodes and send an async getrecmaster call to
2791            them*/
2792         for (j=0; j<nodemap->num; j++) {
2793                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2794                         continue;
2795                 }
2796                 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2797                                         CONTROL_TIMEOUT(),
2798                                         nodemap->nodes[j].pnn);
2799                 if (state == NULL) {
2800                         /* we failed to send the control, treat this as
2801                            an error and try again next iteration
2802                         */
2803                         DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2804                         talloc_free(mem_ctx);
2805                         return MONITOR_FAILED;
2806                 }
2807
2808                 /* set up the callback functions */
2809                 state->async.fn = verify_recmaster_callback;
2810                 state->async.private_data = rmdata;
2811
2812                 /* one more control to wait for to complete */
2813                 rmdata->count++;
2814         }
2815
2816
2817         /* now wait for up to the maximum number of seconds allowed
2818            or until all nodes we expect a response from has replied
2819         */
2820         while (rmdata->count > 0) {
2821                 event_loop_once(ctdb->ev);
2822         }
2823
2824         status = rmdata->status;
2825         talloc_free(mem_ctx);
2826         return status;
2827 }
2828
2829
2830 /* called to check that the local allocation of public ip addresses is ok.
2831 */
2832 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
2833 {
2834         TALLOC_CTX *mem_ctx = talloc_new(NULL);
2835         struct ctdb_control_get_ifaces *ifaces = NULL;
2836         struct ctdb_uptime *uptime1 = NULL;
2837         struct ctdb_uptime *uptime2 = NULL;
2838         int ret, j;
2839         bool need_iface_check = false;
2840         bool need_takeover_run = false;
2841
2842         ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2843                                 CTDB_CURRENT_NODE, &uptime1);
2844         if (ret != 0) {
2845                 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2846                 talloc_free(mem_ctx);
2847                 return -1;
2848         }
2849
2850
2851         /* read the interfaces from the local node */
2852         ret = ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ifaces);
2853         if (ret != 0) {
2854                 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", pnn));
2855                 talloc_free(mem_ctx);
2856                 return -1;
2857         }
2858
2859         if (!rec->ifaces) {
2860                 need_iface_check = true;
2861         } else if (rec->ifaces->num != ifaces->num) {
2862                 need_iface_check = true;
2863         } else if (memcmp(rec->ifaces, ifaces, talloc_get_size(ifaces)) != 0) {
2864                 need_iface_check = true;
2865         }
2866
2867         talloc_free(rec->ifaces);
2868         rec->ifaces = talloc_steal(rec, ifaces);
2869
2870         if (need_iface_check) {
2871                 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
2872                                      "local node %u - force takeover run\n",
2873                                      pnn));
2874                 need_takeover_run = true;
2875         }
2876
2877         ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2878                                 CTDB_CURRENT_NODE, &uptime2);
2879         if (ret != 0) {
2880                 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
2881                 talloc_free(mem_ctx);
2882                 return -1;
2883         }
2884
2885         /* skip the check if the startrecovery time has changed */
2886         if (timeval_compare(&uptime1->last_recovery_started,
2887                             &uptime2->last_recovery_started) != 0) {
2888                 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2889                 talloc_free(mem_ctx);
2890                 return 0;
2891         }
2892
2893         /* skip the check if the endrecovery time has changed */
2894         if (timeval_compare(&uptime1->last_recovery_finished,
2895                             &uptime2->last_recovery_finished) != 0) {
2896                 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
2897                 talloc_free(mem_ctx);
2898                 return 0;
2899         }
2900
2901         /* skip the check if we have started but not finished recovery */
2902         if (timeval_compare(&uptime1->last_recovery_finished,
2903                             &uptime1->last_recovery_started) != 1) {
2904                 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
2905                 talloc_free(mem_ctx);
2906
2907                 return 0;
2908         }
2909
2910         /* verify that we have the ip addresses we should have
2911            and we dont have ones we shouldnt have.
2912            if we find an inconsistency we set recmode to
2913            active on the local node and wait for the recmaster
2914            to do a full blown recovery.
2915            also if the pnn is -1 and we are healthy and can host the ip
2916            we also request a ip reallocation.
2917         */
2918         if (ctdb->tunable.disable_ip_failover == 0) {
2919                 struct ctdb_all_public_ips *ips = NULL;
2920
2921                 /* read the *available* IPs from the local node */
2922                 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2923                 if (ret != 0) {
2924                         DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
2925                         talloc_free(mem_ctx);
2926                         return -1;
2927                 }
2928
2929                 for (j=0; j<ips->num; j++) {
2930                         if (ips->ips[j].pnn == -1 &&
2931                             nodemap->nodes[pnn].flags == 0) {
2932                                 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
2933                                                   ctdb_addr_to_str(&ips->ips[j].addr)));
2934                                 need_takeover_run = true;
2935                         }
2936                 }
2937
2938                 talloc_free(ips);
2939
2940                 /* read the *known* IPs from the local node */
2941                 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2942                 if (ret != 0) {
2943                         DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
2944                         talloc_free(mem_ctx);
2945                         return -1;
2946                 }
2947
2948                 for (j=0; j<ips->num; j++) {
2949                         if (ips->ips[j].pnn == pnn) {
2950                                 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
2951                                         DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
2952                                                 ctdb_addr_to_str(&ips->ips[j].addr)));
2953                                         need_takeover_run = true;
2954                                 }
2955                         } else {
2956                                 if (ctdb->do_checkpublicip &&
2957                                     ctdb_sys_have_ip(&ips->ips[j].addr)) {
2958
2959                                         DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
2960                                                 ctdb_addr_to_str(&ips->ips[j].addr)));
2961
2962                                         if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
2963                                                 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
2964                                         }
2965                                 }
2966                         }
2967                 }
2968         }
2969
2970         if (need_takeover_run) {
2971                 struct takeover_run_reply rd;
2972                 TDB_DATA data;
2973
2974                 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
2975
2976                 rd.pnn = ctdb->pnn;
2977                 rd.srvid = 0;
2978                 data.dptr = (uint8_t *)&rd;
2979                 data.dsize = sizeof(rd);
2980
2981                 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2982                 if (ret != 0) {
2983                         DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
2984                 }
2985         }
2986         talloc_free(mem_ctx);
2987         return 0;
2988 }
2989
2990
2991 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2992 {
2993         struct ctdb_node_map **remote_nodemaps = callback_data;
2994
2995         if (node_pnn >= ctdb->num_nodes) {
2996                 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2997                 return;
2998         }
2999
3000         remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
3001
3002 }
3003
3004 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3005         struct ctdb_node_map *nodemap,
3006         struct ctdb_node_map **remote_nodemaps)
3007 {
3008         uint32_t *nodes;
3009
3010         nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3011         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3012                                         nodes, 0,
3013                                         CONTROL_TIMEOUT(), false, tdb_null,
3014                                         async_getnodemap_callback,
3015                                         NULL,
3016                                         remote_nodemaps) != 0) {
3017                 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3018
3019                 return -1;
3020         }
3021
3022         return 0;
3023 }
3024
3025 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
3026 struct ctdb_check_reclock_state {
3027         struct ctdb_context *ctdb;
3028         struct timeval start_time;
3029         int fd[2];
3030         pid_t child;
3031         struct timed_event *te;
3032         struct fd_event *fde;
3033         enum reclock_child_status status;
3034 };
3035
3036 /* when we free the reclock state we must kill any child process.
3037 */
3038 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
3039 {
3040         struct ctdb_context *ctdb = state->ctdb;
3041
3042         ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
3043
3044         if (state->fd[0] != -1) {
3045                 close(state->fd[0]);
3046                 state->fd[0] = -1;
3047         }
3048         if (state->fd[1] != -1) {
3049                 close(state->fd[1]);
3050                 state->fd[1] = -1;
3051         }
3052         ctdb_kill(ctdb, state->child, SIGKILL);
3053         return 0;
3054 }
3055
3056 /*
3057   called if our check_reclock child times out. this would happen if
3058   i/o to the reclock file blocks.
3059  */
3060 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
3061                                          struct timeval t, void *private_data)
3062 {
3063         struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
3064                                            struct ctdb_check_reclock_state);
3065
3066         DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
3067         state->status = RECLOCK_TIMEOUT;
3068 }
3069
3070 /* this is called when the child process has completed checking the reclock
3071    file and has written data back to us through the pipe.
3072 */
3073 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
3074                              uint16_t flags, void *private_data)
3075 {
3076         struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
3077                                              struct ctdb_check_reclock_state);
3078         char c = 0;
3079         int ret;
3080
3081         /* we got a response from our child process so we can abort the
3082            timeout.
3083         */
3084         talloc_free(state->te);
3085         state->te = NULL;
3086
3087         ret = read(state->fd[0], &c, 1);
3088         if (ret != 1 || c != RECLOCK_OK) {
3089                 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
3090                 state->status = RECLOCK_FAILED;
3091
3092                 return;
3093         }
3094
3095         state->status = RECLOCK_OK;
3096         return;
3097 }
3098
3099 static int check_recovery_lock(struct ctdb_context *ctdb)
3100 {
3101         int ret;
3102         struct ctdb_check_reclock_state *state;
3103         pid_t parent = getpid();
3104
3105         if (ctdb->recovery_lock_fd == -1) {
3106                 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
3107                 return -1;
3108         }
3109
3110         state = talloc(ctdb, struct ctdb_check_reclock_state);
3111         CTDB_NO_MEMORY(ctdb, state);
3112
3113         state->ctdb = ctdb;
3114         state->start_time = timeval_current();
3115         state->status = RECLOCK_CHECKING;
3116         state->fd[0] = -1;
3117         state->fd[1] = -1;
3118
3119         ret = pipe(state->fd);
3120         if (ret != 0) {
3121                 talloc_free(state);
3122                 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
3123                 return -1;
3124         }
3125
3126         state->child = ctdb_fork(ctdb);
3127         if (state->child == (pid_t)-1) {
3128                 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
3129                 close(state->fd[0]);
3130                 state->fd[0] = -1;
3131                 close(state->fd[1]);
3132                 state->fd[1] = -1;
3133                 talloc_free(state);
3134                 return -1;
3135         }
3136
3137         if (state->child == 0) {
3138                 char cc = RECLOCK_OK;
3139                 close(state->fd[0]);
3140                 state->fd[0] = -1;
3141
3142                 debug_extra = talloc_asprintf(NULL, "recovery-lock:");
3143                 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
3144                         DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
3145                         cc = RECLOCK_FAILED;
3146                 }
3147
3148                 write(state->fd[1], &cc, 1);
3149                 /* make sure we die when our parent dies */
3150                 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
3151                         sleep(5);
3152                         write(state->fd[1], &cc, 1);
3153                 }
3154                 _exit(0);
3155         }
3156         close(state->fd[1]);
3157         state->fd[1] = -1;
3158         set_close_on_exec(state->fd[0]);
3159
3160         DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
3161
3162         talloc_set_destructor(state, check_reclock_destructor);
3163
3164         state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
3165                                     ctdb_check_reclock_timeout, state);
3166         if (state->te == NULL) {
3167                 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
3168                 talloc_free(state);
3169                 return -1;
3170         }
3171
3172         state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
3173                                 EVENT_FD_READ,
3174                                 reclock_child_handler,
3175                                 (void *)state);
3176
3177         if (state->fde == NULL) {
3178                 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
3179                 talloc_free(state);
3180                 return -1;
3181         }
3182         tevent_fd_set_auto_close(state->fde);
3183
3184         while (state->status == RECLOCK_CHECKING) {
3185                 event_loop_once(ctdb->ev);
3186         }
3187
3188         if (state->status == RECLOCK_FAILED) {
3189                 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
3190                 close(ctdb->recovery_lock_fd);
3191                 ctdb->recovery_lock_fd = -1;
3192                 talloc_free(state);
3193                 return -1;
3194         }
3195
3196         talloc_free(state);
3197         return 0;
3198 }
3199
3200 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3201 {
3202         TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3203         const char *reclockfile;
3204
3205         if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3206                 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3207                 talloc_free(tmp_ctx);
3208                 return -1;
3209         }
3210
3211         if (reclockfile == NULL) {
3212                 if (ctdb->recovery_lock_file != NULL) {
3213                         DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
3214                         talloc_free(ctdb->recovery_lock_file);
3215                         ctdb->recovery_lock_file = NULL;
3216                         if (ctdb->recovery_lock_fd != -1) {
3217                                 close(ctdb->recovery_lock_fd);
3218                                 ctdb->recovery_lock_fd = -1;
3219                         }
3220                 }
3221                 ctdb->tunable.verify_recovery_lock = 0;
3222                 talloc_free(tmp_ctx);
3223                 return 0;
3224         }
3225
3226         if (ctdb->recovery_lock_file == NULL) {
3227                 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3228                 if (ctdb->recovery_lock_fd != -1) {
3229                         close(ctdb->recovery_lock_fd);
3230                         ctdb->recovery_lock_fd = -1;
3231                 }
3232                 talloc_free(tmp_ctx);
3233                 return 0;
3234         }
3235
3236
3237         if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3238                 talloc_free(tmp_ctx);
3239                 return 0;
3240         }
3241
3242         talloc_free(ctdb->recovery_lock_file);
3243         ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3244         ctdb->tunable.verify_recovery_lock = 0;
3245         if (ctdb->recovery_lock_fd != -1) {
3246                 close(ctdb->recovery_lock_fd);
3247                 ctdb->recovery_lock_fd = -1;
3248         }
3249
3250         talloc_free(tmp_ctx);
3251         return 0;
3252 }
3253
3254 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3255                       TALLOC_CTX *mem_ctx)
3256 {
3257         uint32_t pnn;
3258         struct ctdb_node_map *nodemap=NULL;
3259         struct ctdb_node_map *recmaster_nodemap=NULL;
3260         struct ctdb_node_map **remote_nodemaps=NULL;
3261         struct ctdb_vnn_map *vnnmap=NULL;
3262         struct ctdb_vnn_map *remote_vnnmap=NULL;
3263         int32_t debug_level;
3264         int i, j, ret;
3265
3266
3267
3268         /* verify that the main daemon is still running */
3269         if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3270                 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3271                 exit(-1);
3272         }
3273
3274         /* ping the local daemon to tell it we are alive */
3275         ctdb_ctrl_recd_ping(ctdb);
3276
3277         if (rec->election_timeout) {
3278                 /* an election is in progress */
3279                 return;
3280         }
3281
3282         /* read the debug level from the parent and update locally */
3283         ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3284         if (ret !=0) {
3285                 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3286                 return;
3287         }
3288         LogLevel = debug_level;
3289
3290
3291         /* We must check if we need to ban a node here but we want to do this
3292            as early as possible so we dont wait until we have pulled the node
3293            map from the local node. thats why we have the hardcoded value 20
3294         */
3295         for (i=0; i<ctdb->num_nodes; i++) {
3296                 struct ctdb_banning_state *ban_state;
3297
3298                 if (ctdb->nodes[i]->ban_state == NULL) {
3299                         continue;
3300                 }
3301                 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
3302                 if (ban_state->count < 20) {
3303                         continue;
3304                 }
3305                 DEBUG(DEBUG_NOTICE,("Node %u has caused %u recoveries recently - banning it for %u seconds\n",
3306                         ctdb->nodes[i]->pnn, ban_state->count,
3307                         ctdb->tunable.recovery_ban_period));
3308                 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
3309                 ban_state->count = 0;
3310         }
3311
3312         /* get relevant tunables */
3313         ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3314         if (ret != 0) {
3315                 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3316                 return;
3317         }
3318
3319         /* get the current recovery lock file from the server */
3320         if (update_recovery_lock_file(ctdb) != 0) {
3321                 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3322                 return;
3323         }
3324
3325         /* Make sure that if recovery lock verification becomes disabled when
3326            we close the file
3327         */
3328         if (ctdb->tunable.verify_recovery_lock == 0) {
3329                 if (ctdb->recovery_lock_fd != -1) {
3330                         close(ctdb->recovery_lock_fd);
3331                         ctdb->recovery_lock_fd = -1;
3332                 }
3333         }
3334
3335         pnn = ctdb_ctrl_getpnn(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3336         if (pnn == (uint32_t)-1) {
3337                 DEBUG(DEBUG_ERR,("Failed to get local pnn - retrying\n"));
3338                 return;
3339         }
3340
3341         /* get the vnnmap */
3342         ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3343         if (ret != 0) {
3344                 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3345                 return;
3346         }
3347
3348
3349         /* get number of nodes */
3350         if (rec->nodemap) {
3351                 talloc_free(rec->nodemap);
3352                 rec->nodemap = NULL;
3353                 nodemap=NULL;
3354         }
3355         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3356         if (ret != 0) {
3357                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3358                 return;
3359         }
3360         nodemap = rec->nodemap;
3361
3362         /* update the capabilities for all nodes */
3363         ret = update_capabilities(ctdb, nodemap);
3364         if (ret != 0) {
3365                 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3366                 return;
3367         }
3368
3369         /* check which node is the recovery master */
3370         ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3371         if (ret != 0) {
3372                 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3373                 return;
3374         }
3375
3376         /* if we are not the recmaster we can safely ignore any ip reallocate requests */
3377         if (rec->recmaster != pnn) {
3378                 if (rec->ip_reallocate_ctx != NULL) {
3379                         talloc_free(rec->ip_reallocate_ctx);
3380                         rec->ip_reallocate_ctx = NULL;
3381                         rec->reallocate_callers = NULL;
3382                 }
3383         }
3384
3385         if (rec->recmaster == (uint32_t)-1) {
3386                 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3387                 force_election(rec, pnn, nodemap);
3388                 return;
3389         }
3390
3391         /* if the local daemon is STOPPED, we verify that the databases are
3392            also frozen and thet the recmode is set to active
3393         */
3394         if (nodemap->nodes[pnn].flags & NODE_FLAGS_STOPPED) {
3395                 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3396                 if (ret != 0) {
3397                         DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3398                 }
3399                 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3400                         DEBUG(DEBUG_ERR,("Node is stopped but recovery mode is not active. Activate recovery mode and lock databases\n"));
3401
3402                         ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
3403                         if (ret != 0) {
3404                                 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED state\n"));
3405                                 return;
3406                         }
3407                         ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3408                         if (ret != 0) {
3409                                 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED state\n"));
3410
3411                                 return;
3412                         }
3413                         return;
3414                 }
3415         }
3416         /* If the local node is stopped, verify we are not the recmaster
3417            and yield this role if so
3418         */
3419         if ((nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) && (rec->recmaster == pnn)) {
3420                 DEBUG(DEBUG_ERR,("Local node is INACTIVE. Yielding recmaster role\n"));
3421                 force_election(rec, pnn, nodemap);
3422                 return;
3423         }
3424
3425         /*
3426          * if the current recmaster do not have CTDB_CAP_RECMASTER,
3427          * but we have force an election and try to become the new
3428          * recmaster
3429          */
3430         if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
3431             (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3432              !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3433                 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3434                                   " but we (node %u) have - force an election\n",
3435                                   rec->recmaster, pnn));
3436                 force_election(rec, pnn, nodemap);
3437                 return;
3438         }
3439
3440         /* check that we (recovery daemon) and the local ctdb daemon
3441            agrees on whether we are banned or not
3442         */
3443 //qqq
3444
3445         /* remember our own node flags */
3446         rec->node_flags = nodemap->nodes[pnn].flags;
3447
3448         /* count how many active nodes there are */
3449         rec->num_active    = 0;
3450         rec->num_connected = 0;
3451         for (i=0; i<nodemap->num; i++) {
3452                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3453                         rec->num_active++;
3454                 }
3455                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3456                         rec->num_connected++;
3457                 }
3458         }
3459
3460
3461         /* verify that the recmaster node is still active */
3462         for (j=0; j<nodemap->num; j++) {
3463                 if (nodemap->nodes[j].pnn==rec->recmaster) {
3464                         break;
3465                 }
3466         }
3467
3468         if (j == nodemap->num) {
3469                 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3470                 force_election(rec, pnn, nodemap);
3471                 return;
3472         }
3473
3474         /* if recovery master is disconnected we must elect a new recmaster */
3475         if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3476                 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3477                 force_election(rec, pnn, nodemap);
3478                 return;
3479         }
3480
3481         /* get nodemap from the recovery master to check if it is inactive */
3482         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3483                                    mem_ctx, &recmaster_nodemap);
3484         if (ret != 0) {
3485                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3486                           nodemap->nodes[j].pnn));
3487                 return;
3488         }
3489
3490
3491         if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3492             (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3493                 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3494                 force_election(rec, pnn, nodemap);
3495                 return;
3496         }
3497
3498         /* If this node is stopped then it is not the recovery master
3499          * so the only remaining action is to potentially to verify
3500          * the local IP allocation below.  This won't accomplish
3501          * anything useful so skip it.
3502          */
3503         if (rec->node_flags & NODE_FLAGS_STOPPED) {
3504                 return;
3505         }
3506
3507         /* verify that we have all ip addresses we should have and we dont
3508          * have addresses we shouldnt have.
3509          */
3510         if (ctdb->tunable.disable_ip_failover == 0) {
3511                 if (rec->ip_check_disable_ctx == NULL) {
3512                         if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3513                                 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3514                         }
3515                 }
3516         }
3517
3518
3519         /* if we are not the recmaster then we do not need to check
3520            if recovery is needed
3521          */
3522         if (pnn != rec->recmaster) {
3523                 return;
3524         }
3525
3526
3527         /* ensure our local copies of flags are right */
3528         ret = update_local_flags(rec, nodemap);
3529         if (ret == MONITOR_ELECTION_NEEDED) {
3530                 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3531                 force_election(rec, pnn, nodemap);
3532                 return;
3533         }
3534         if (ret != MONITOR_OK) {
3535                 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3536                 return;
3537         }
3538
3539         if (ctdb->num_nodes != nodemap->num) {
3540                 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3541                 reload_nodes_file(ctdb);
3542                 return;
3543         }
3544
3545         /* verify that all active nodes agree that we are the recmaster */
3546         switch (verify_recmaster(rec, nodemap, pnn)) {
3547         case MONITOR_RECOVERY_NEEDED:
3548                 /* can not happen */
3549                 return;
3550         case MONITOR_ELECTION_NEEDED:
3551                 force_election(rec, pnn, nodemap);
3552                 return;
3553         case MONITOR_OK:
3554                 break;
3555         case MONITOR_FAILED:
3556                 return;
3557         }
3558
3559
3560         if (rec->need_recovery) {
3561                 /* a previous recovery didn't finish */
3562                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3563                 return;
3564         }
3565
3566         /* verify that all active nodes are in normal mode
3567            and not in recovery mode
3568         */
3569         switch (verify_recmode(ctdb, nodemap)) {
3570         case MONITOR_RECOVERY_NEEDED:
3571                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3572                 return;
3573         case MONITOR_FAILED:
3574                 return;
3575         case MONITOR_ELECTION_NEEDED:
3576                 /* can not happen */
3577         case MONITOR_OK:
3578                 break;
3579         }
3580
3581
3582         if (ctdb->tunable.verify_recovery_lock != 0) {
3583                 /* we should have the reclock - check its not stale */
3584                 ret = check_recovery_lock(ctdb);
3585                 if (ret != 0) {
3586                         DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3587                         ctdb_set_culprit(rec, ctdb->pnn);
3588                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3589                         return;
3590                 }
3591         }
3592
3593
3594         /* is there a pending reload all ips ? */
3595         if (reload_all_ips_request != NULL) {
3596                 reload_all_ips(ctdb, rec, nodemap, reload_all_ips_request);
3597                 talloc_free(reload_all_ips_request);
3598                 reload_all_ips_request = NULL;
3599         }
3600
3601         /* if there are takeovers requested, perform it and notify the waiters */
3602         if (rec->reallocate_callers) {
3603                 process_ipreallocate_requests(ctdb, rec);
3604         }
3605
3606         /* get the nodemap for all active remote nodes
3607          */
3608         remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3609         if (remote_nodemaps == NULL) {
3610                 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3611                 return;
3612         }
3613         for(i=0; i<nodemap->num; i++) {
3614                 remote_nodemaps[i] = NULL;
3615         }
3616         if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3617                 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3618                 return;
3619         }
3620
3621         /* verify that all other nodes have the same nodemap as we have
3622         */
3623         for (j=0; j<nodemap->num; j++) {
3624                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3625                         continue;
3626                 }
3627
3628                 if (remote_nodemaps[j] == NULL) {
3629                         DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3630                         ctdb_set_culprit(rec, j);
3631
3632                         return;
3633                 }
3634
3635                 /* if the nodes disagree on how many nodes there are
3636                    then this is a good reason to try recovery
3637                  */
3638                 if (remote_nodemaps[j]->num != nodemap->num) {
3639                         DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3640                                   nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3641                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3642                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3643                         return;
3644                 }
3645
3646                 /* if the nodes disagree on which nodes exist and are
3647                    active, then that is also a good reason to do recovery
3648                  */
3649                 for (i=0;i<nodemap->num;i++) {
3650                         if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3651                                 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3652                                           nodemap->nodes[j].pnn, i,
3653                                           remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3654                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3655                                 do_recovery(rec, mem_ctx, pnn, nodemap,
3656                                             vnnmap);
3657                                 return;
3658                         }
3659                 }
3660
3661                 /* verify the flags are consistent
3662                 */
3663                 for (i=0; i<nodemap->num; i++) {
3664                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3665                                 continue;
3666                         }
3667
3668                         if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3669                                 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3670                                   nodemap->nodes[j].pnn,
3671                                   nodemap->nodes[i].pnn,
3672                                   remote_nodemaps[j]->nodes[i].flags,
3673                                   nodemap->nodes[j].flags));
3674                                 if (i == j) {
3675                                         DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3676                                         update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3677                                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3678                                         do_recovery(rec, mem_ctx, pnn, nodemap,
3679                                                     vnnmap);
3680                                         return;
3681                                 } else {
3682                                         DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3683                                         update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3684                                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3685                                         do_recovery(rec, mem_ctx, pnn, nodemap,
3686                                                     vnnmap);
3687                                         return;
3688                                 }
3689                         }
3690                 }
3691         }
3692
3693
3694         /* there better be the same number of lmasters in the vnn map
3695            as there are active nodes or we will have to do a recovery
3696          */
3697         if (vnnmap->size != rec->num_active) {
3698                 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active nodes. %u vs %u\n",
3699                           vnnmap->size, rec->num_active));
3700                 ctdb_set_culprit(rec, ctdb->pnn);
3701                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3702                 return;
3703         }
3704
3705         /* verify that all active nodes in the nodemap also exist in
3706            the vnnmap.
3707          */
3708         for (j=0; j<nodemap->num; j++) {
3709                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3710                         continue;
3711                 }
3712                 if (nodemap->nodes[j].pnn == pnn) {
3713                         continue;
3714                 }
3715
3716                 for (i=0; i<vnnmap->size; i++) {
3717                         if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3718                                 break;
3719                         }
3720                 }
3721                 if (i == vnnmap->size) {
3722                         DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3723                                   nodemap->nodes[j].pnn));
3724                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3725                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3726                         return;
3727                 }
3728         }
3729
3730
3731         /* verify that all other nodes have the same vnnmap
3732            and are from the same generation
3733          */
3734         for (j=0; j<nodemap->num; j++) {
3735                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3736                         continue;
3737                 }
3738                 if (nodemap->nodes[j].pnn == pnn) {
3739                         continue;
3740                 }
3741
3742                 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3743                                           mem_ctx, &remote_vnnmap);
3744                 if (ret != 0) {
3745                         DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3746                                   nodemap->nodes[j].pnn));
3747                         return;
3748                 }
3749
3750                 /* verify the vnnmap generation is the same */
3751                 if (vnnmap->generation != remote_vnnmap->generation) {
3752                         DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3753                                   nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3754                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3755                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3756                         return;
3757                 }
3758
3759                 /* verify the vnnmap size is the same */
3760                 if (vnnmap->size != remote_vnnmap->size) {
3761                         DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3762                                   nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3763                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3764                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3765                         return;
3766                 }
3767
3768                 /* verify the vnnmap is the same */
3769                 for (i=0;i<vnnmap->size;i++) {
3770                         if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3771                                 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3772                                           nodemap->nodes[j].pnn));
3773                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3774                                 do_recovery(rec, mem_ctx, pnn, nodemap,
3775                                             vnnmap);
3776                                 return;
3777                         }
3778                 }
3779         }
3780
3781         /* we might need to change who has what IP assigned */
3782         if (rec->need_takeover_run) {
3783                 uint32_t culprit = (uint32_t)-1;
3784
3785                 rec->need_takeover_run = false;
3786
3787                 /* update the list of public ips that a node can handle for
3788                    all connected nodes
3789                 */
3790                 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3791                 if (ret != 0) {
3792                         DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
3793                                          culprit));
3794                         rec->need_takeover_run = true;
3795                         return;
3796                 }
3797
3798                 /* execute the "startrecovery" event script on all nodes */
3799                 ret = run_startrecovery_eventscript(rec, nodemap);
3800                 if (ret!=0) {
3801                         DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3802                         ctdb_set_culprit(rec, ctdb->pnn);
3803                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3804                         return;
3805                 }
3806
3807                 /* If takeover run fails, then the offending nodes are
3808                  * assigned ban culprit counts. And we re-try takeover.
3809                  * If takeover run fails repeatedly, the node would get
3810                  * banned.
3811                  *
3812                  * If rec->need_takeover_run is not set to true at this
3813                  * failure, monitoring is disabled cluster-wide (via
3814                  * startrecovery eventscript) and will not get enabled.
3815                  */
3816                 ret = ctdb_takeover_run(ctdb, nodemap, takeover_fail_callback, rec);
3817                 if (ret != 0) {
3818                         DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. Trying again\n"));
3819                         return;
3820                 }
3821
3822                 /* execute the "recovered" event script on all nodes */
3823                 ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
3824 #if 0
3825 // we cant check whether the event completed successfully
3826 // since this script WILL fail if the node is in recovery mode
3827 // and if that race happens, the code here would just cause a second
3828 // cascading recovery.
3829                 if (ret!=0) {
3830                         DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3831                         ctdb_set_culprit(rec, ctdb->pnn);
3832                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3833                 }
3834 #endif
3835         }
3836 }
3837
3838 /*
3839   the main monitoring loop
3840  */
3841 static void monitor_cluster(struct ctdb_context *ctdb)
3842 {
3843         struct ctdb_recoverd *rec;
3844
3845         DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3846
3847         rec = talloc_zero(ctdb, struct ctdb_recoverd);
3848         CTDB_NO_MEMORY_FATAL(ctdb, rec);
3849
3850         rec->ctdb = ctdb;
3851
3852         rec->priority_time = timeval_current();
3853
3854         /* register a message port for sending memory dumps */
3855         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3856
3857         /* register a message port for requesting logs */
3858         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_GETLOG, getlog_handler, rec);
3859
3860         /* register a message port for clearing logs */
3861         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_CLEARLOG, clearlog_handler, rec);
3862
3863         /* register a message port for recovery elections */
3864         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
3865
3866         /* when nodes are disabled/enabled */
3867         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3868
3869         /* when we are asked to puch out a flag change */
3870         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3871
3872         /* register a message port for vacuum fetch */
3873         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3874
3875         /* register a message port for reloadnodes  */
3876         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3877
3878         /* register a message port for performing a takeover run */
3879         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3880
3881         /* register a message port for performing a reload all ips */
3882         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_ALL_IPS, ip_reloadall_handler, rec);
3883
3884         /* register a message port for disabling the ip check for a short while */
3885         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3886
3887         /* register a message port for updating the recovery daemons node assignment for an ip */
3888         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
3889
3890         /* register a message port for forcing a rebalance of a node next
3891            reallocation */
3892         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3893
3894         for (;;) {
3895                 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3896                 struct timeval start;
3897                 double elapsed;
3898
3899                 if (!mem_ctx) {
3900                         DEBUG(DEBUG_CRIT,(__location__
3901                                           " Failed to create temp context\n"));
3902                         exit(-1);
3903                 }
3904
3905                 start = timeval_current();
3906                 main_loop(ctdb, rec, mem_ctx);
3907                 talloc_free(mem_ctx);
3908
3909                 /* we only check for recovery once every second */
3910                 elapsed = timeval_elapsed(&start);
3911                 if (elapsed < ctdb->tunable.recover_interval) {
3912                         ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3913                                           - elapsed);
3914                 }
3915         }
3916 }
3917
3918 /*
3919   event handler for when the main ctdbd dies
3920  */
3921 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
3922                                  uint16_t flags, void *private_data)
3923 {
3924         DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3925         _exit(1);
3926 }
3927
3928 /*
3929   called regularly to verify that the recovery daemon is still running
3930  */
3931 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
3932                               struct timeval yt, void *p)
3933 {
3934         struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3935
3936         if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3937                 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3938
3939                 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
3940                                 ctdb_restart_recd, ctdb);
3941
3942                 return;
3943         }
3944
3945         event_add_timed(ctdb->ev, ctdb,
3946                         timeval_current_ofs(30, 0),
3947                         ctdb_check_recd, ctdb);
3948 }
3949
3950 static void recd_sig_child_handler(struct event_context *ev,
3951         struct signal_event *se, int signum, int count,
3952         void *dont_care,
3953         void *private_data)
3954 {
3955 //      struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3956         int status;
3957         pid_t pid = -1;
3958
3959         while (pid != 0) {
3960                 pid = waitpid(-1, &status, WNOHANG);
3961                 if (pid == -1) {
3962                         if (errno != ECHILD) {
3963                                 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3964                         }
3965                         return;
3966                 }
3967                 if (pid > 0) {
3968                         DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3969                 }
3970         }
3971 }
3972
3973 /*
3974   startup the recovery daemon as a child of the main ctdb daemon
3975  */
3976 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3977 {
3978         int fd[2];
3979         struct signal_event *se;
3980         struct tevent_fd *fde;
3981
3982         if (pipe(fd) != 0) {
3983                 return -1;
3984         }
3985
3986         ctdb->ctdbd_pid = getpid();
3987
3988         ctdb->recoverd_pid = ctdb_fork(ctdb);
3989         if (ctdb->recoverd_pid == -1) {
3990                 return -1;
3991         }
3992
3993         if (ctdb->recoverd_pid != 0) {
3994                 close(fd[0]);
3995                 event_add_timed(ctdb->ev, ctdb,
3996                                 timeval_current_ofs(30, 0),
3997                                 ctdb_check_recd, ctdb);
3998                 return 0;
3999         }
4000
4001         close(fd[1]);
4002
4003         srandom(getpid() ^ time(NULL));
4004
4005         if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4006                 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4007                 exit(1);
4008         }
4009
4010         DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
4011
4012         fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
4013                      ctdb_recoverd_parent, &fd[0]);
4014         tevent_fd_set_auto_close(fde);
4015
4016         /* set up a handler to pick up sigchld */
4017         se = event_add_signal(ctdb->ev, ctdb,
4018                                      SIGCHLD, 0,
4019                                      recd_sig_child_handler,
4020                                      ctdb);
4021         if (se == NULL) {
4022                 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4023                 exit(1);
4024         }
4025
4026         monitor_cluster(ctdb);
4027
4028         DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4029         return -1;
4030 }
4031
4032 /*
4033   shutdown the recovery daemon
4034  */
4035 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4036 {
4037         if (ctdb->recoverd_pid == 0) {
4038                 return;
4039         }
4040
4041         DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4042         ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
4043 }
4044
4045 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4046                        struct timeval t, void *private_data)
4047 {
4048         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4049
4050         DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4051         ctdb_stop_recoverd(ctdb);
4052         ctdb_start_recoverd(ctdb);
4053 }