ctdb-daemon: Do not allow database detach if AllowClientDBAttach=1
[Samba.git] / ctdb / server / ctdb_recoverd.c
blob ac692ec5419f1309625ab505295bd5b898df550f
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "includes.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
25 #include "popt.h"
26 #include "cmdline.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
29 #include "db_wrap.h"
30 #include "dlinklist.h"
33 /* List of SRVID requests that need to be processed */
34 struct srvid_list {
35 struct srvid_list *next, *prev;
36 struct srvid_request *request;
39 struct srvid_requests {
40 struct srvid_list *requests;
43 static void srvid_request_reply(struct ctdb_context *ctdb,
44 struct srvid_request *request,
45 TDB_DATA result)
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request->srvid == 0) {
49 talloc_free(request);
50 return;
53 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
54 result) == 0) {
55 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request->pnn,
57 (unsigned long long)request->srvid));
58 } else {
59 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request->pnn,
61 (unsigned long long)request->srvid));
64 talloc_free(request);
67 static void srvid_requests_reply(struct ctdb_context *ctdb,
68 struct srvid_requests **requests,
69 TDB_DATA result)
71 struct srvid_list *r;
73 for (r = (*requests)->requests; r != NULL; r = r->next) {
74 srvid_request_reply(ctdb, r->request, result);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests);
81 static void srvid_request_add(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
83 struct srvid_request *request)
85 struct srvid_list *t;
86 int32_t ret;
87 TDB_DATA result;
89 if (*requests == NULL) {
90 *requests = talloc_zero(ctdb, struct srvid_requests);
91 if (*requests == NULL) {
92 goto nomem;
96 t = talloc_zero(*requests, struct srvid_list);
97 if (t == NULL) {
98 /* If *requests was just allocated above then free it */
99 if ((*requests)->requests == NULL) {
100 TALLOC_FREE(*requests);
102 goto nomem;
105 t->request = (struct srvid_request *)talloc_steal(t, request);
106 DLIST_ADD((*requests)->requests, t);
108 return;
110 nomem:
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
114 ret = -ENOMEM;
115 result.dsize = sizeof(ret);
116 result.dptr = (uint8_t *)&ret;
117 srvid_request_reply(ctdb, request, result);
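/* Per-node banning state: how many banning credits a node has accumulated
   and when it last misbehaved. ban_misbehaving_nodes() bans a node once its
   count reaches twice the number of nodes in the cluster. */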
120 struct ctdb_banning_state {
121 uint32_t count;
122 struct timeval last_reported_time;
126 private state of recovery daemon
128 struct ctdb_recoverd {
129 struct ctdb_context *ctdb;
130 uint32_t recmaster;
131 uint32_t num_active;
132 uint32_t num_lmasters;
133 uint32_t num_connected;
134 uint32_t last_culprit_node;
135 struct ctdb_node_map *nodemap;
136 struct timeval priority_time;
137 bool need_takeover_run;
138 bool need_recovery;
139 uint32_t node_flags;
140 struct timed_event *send_election_te;
141 struct timed_event *election_timeout;
142 struct vacuum_info *vacuum_info;
143 struct srvid_requests *reallocate_requests;
144 bool takeover_run_in_progress;
145 TALLOC_CTX *takeover_runs_disable_ctx;
146 struct ctdb_control_get_ifaces *ifaces;
147 uint32_t *force_rebalance_nodes;
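/* Timeouts used for controls and for the main monitoring loop, derived from
   the recover_timeout and recover_interval tunables. */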
150 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
151 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
153 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
156 ban a node for a period of time
158 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
160 int ret;
161 struct ctdb_context *ctdb = rec->ctdb;
162 struct ctdb_ban_time bantime;
164 if (!ctdb_validate_pnn(ctdb, pnn)) {
165 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
166 return;
169 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
171 bantime.pnn = pnn;
172 bantime.time = ban_time;
174 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
175 if (ret != 0) {
176 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
177 return;
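/* Possible outcomes when the recovery daemon checks the cluster state. */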
182 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
186 remember the troublemaker
188 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
190 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
191 struct ctdb_banning_state *ban_state;
193 if (culprit >= ctdb->num_nodes) {
194 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
195 return;
198 /* If we are banned or stopped, do not set other nodes as culprits */
199 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
200 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
201 return;
204 if (ctdb->nodes[culprit]->ban_state == NULL) {
205 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
206 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
210 ban_state = ctdb->nodes[culprit]->ban_state;
211 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
212 /* this was the first time in a long while this node
213 misbehaved so we will forgive any old transgressions.
215 ban_state->count = 0;
218 ban_state->count += count;
219 ban_state->last_reported_time = timeval_current();
220 rec->last_culprit_node = culprit;
224 remember the troublemaker
226 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
228 ctdb_set_culprit_count(rec, culprit, 1);
232 /* this callback is called for every node that failed to execute the
233 recovered event
235 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
237 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
239 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
241 ctdb_set_culprit(rec, node_pnn);
245 run the "recovered" eventscript on all nodes
247 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
249 TALLOC_CTX *tmp_ctx;
250 uint32_t *nodes;
251 struct ctdb_context *ctdb = rec->ctdb;
253 tmp_ctx = talloc_new(ctdb);
254 CTDB_NO_MEMORY(ctdb, tmp_ctx);
256 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
257 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
258 nodes, 0,
259 CONTROL_TIMEOUT(), false, tdb_null,
260 NULL, recovered_fail_callback,
261 rec) != 0) {
262 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
264 talloc_free(tmp_ctx);
265 return -1;
268 talloc_free(tmp_ctx);
269 return 0;
272 /* this callback is called for every node that failed to execute the
273 start recovery event
275 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
277 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
279 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
281 ctdb_set_culprit(rec, node_pnn);
285 run the "startrecovery" eventscript on all nodes
287 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
289 TALLOC_CTX *tmp_ctx;
290 uint32_t *nodes;
291 struct ctdb_context *ctdb = rec->ctdb;
293 tmp_ctx = talloc_new(ctdb);
294 CTDB_NO_MEMORY(ctdb, tmp_ctx);
296 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
297 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
298 nodes, 0,
299 CONTROL_TIMEOUT(), false, tdb_null,
300 NULL,
301 startrecovery_fail_callback,
302 rec) != 0) {
303 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
304 talloc_free(tmp_ctx);
305 return -1;
308 talloc_free(tmp_ctx);
309 return 0;
312 static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
314 if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
315 DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata.dsize, outdata.dptr));
316 return;
318 if (node_pnn < ctdb->num_nodes) {
319 ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
322 if (node_pnn == ctdb->pnn) {
323 ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
328 update the node capabilities for all connected nodes
330 static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
332 uint32_t *nodes;
333 TALLOC_CTX *tmp_ctx;
335 tmp_ctx = talloc_new(ctdb);
336 CTDB_NO_MEMORY(ctdb, tmp_ctx);
338 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
339 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
340 nodes, 0,
341 CONTROL_TIMEOUT(),
342 false, tdb_null,
343 async_getcap_callback, NULL,
344 NULL) != 0) {
345 DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
346 talloc_free(tmp_ctx);
347 return -1;
350 talloc_free(tmp_ctx);
351 return 0;
354 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
356 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
358 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
359 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
362 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
364 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
366 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
367 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
371 change recovery mode on all nodes
373 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
375 TDB_DATA data;
376 uint32_t *nodes;
377 TALLOC_CTX *tmp_ctx;
379 tmp_ctx = talloc_new(ctdb);
380 CTDB_NO_MEMORY(ctdb, tmp_ctx);
382 /* freeze all nodes */
383 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
384 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
385 int i;
387 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
388 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
389 nodes, i,
390 CONTROL_TIMEOUT(),
391 false, tdb_null,
392 NULL,
393 set_recmode_fail_callback,
394 rec) != 0) {
395 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
396 talloc_free(tmp_ctx);
397 return -1;
403 data.dsize = sizeof(uint32_t);
404 data.dptr = (unsigned char *)&rec_mode;
406 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
407 nodes, 0,
408 CONTROL_TIMEOUT(),
409 false, data,
410 NULL, NULL,
411 NULL) != 0) {
412 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
413 talloc_free(tmp_ctx);
414 return -1;
417 talloc_free(tmp_ctx);
418 return 0;
422 change recovery master on all nodes
424 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
426 TDB_DATA data;
427 TALLOC_CTX *tmp_ctx;
428 uint32_t *nodes;
430 tmp_ctx = talloc_new(ctdb);
431 CTDB_NO_MEMORY(ctdb, tmp_ctx);
433 data.dsize = sizeof(uint32_t);
434 data.dptr = (unsigned char *)&pnn;
436 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
437 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
438 nodes, 0,
439 CONTROL_TIMEOUT(), false, data,
440 NULL, NULL,
441 NULL) != 0) {
442 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
443 talloc_free(tmp_ctx);
444 return -1;
447 talloc_free(tmp_ctx);
448 return 0;
451 /* update all remote nodes to use the same db priority that we have
452 this can fail if the remote node has not yet been upgraded to
453 support this function, so we always return success and never fail
454 a recovery if this call fails.
456 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
457 struct ctdb_node_map *nodemap,
458 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
460 int db;
462 /* step through all local databases */
463 for (db=0; db<dbmap->num;db++) {
464 struct ctdb_db_priority db_prio;
465 int ret;
467 db_prio.db_id = dbmap->dbs[db].dbid;
468 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
469 if (ret != 0) {
470 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
471 continue;
474 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
476 ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
477 CTDB_CURRENT_NODE, &db_prio);
478 if (ret != 0) {
479 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
480 db_prio.db_id));
484 return 0;
488 ensure all other nodes have attached to any databases that we have
490 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
491 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
493 int i, j, db, ret;
494 struct ctdb_dbid_map *remote_dbmap;
496 /* verify that all other nodes have all our databases */
497 for (j=0; j<nodemap->num; j++) {
498 /* we don't need to check ourselves */
499 if (nodemap->nodes[j].pnn == pnn) {
500 continue;
502 /* don't check nodes that are unavailable */
503 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
504 continue;
507 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
508 mem_ctx, &remote_dbmap);
509 if (ret != 0) {
510 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
511 return -1;
514 /* step through all local databases */
515 for (db=0; db<dbmap->num;db++) {
516 const char *name;
519 for (i=0;i<remote_dbmap->num;i++) {
520 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
521 break;
524 /* the remote node already has this database */
525 if (i!=remote_dbmap->num) {
526 continue;
528 /* ok so we need to create this database */
529 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
530 dbmap->dbs[db].dbid, mem_ctx,
531 &name);
532 if (ret != 0) {
533 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
534 return -1;
536 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
537 nodemap->nodes[j].pnn,
538 mem_ctx, name,
539 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
540 if (ret != 0) {
541 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
542 return -1;
547 return 0;
552 ensure we are attached to any databases that anyone else is attached to
554 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
555 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
557 int i, j, db, ret;
558 struct ctdb_dbid_map *remote_dbmap;
560 /* verify that we have all databases any other node has */
561 for (j=0; j<nodemap->num; j++) {
562 /* we don't need to check ourselves */
563 if (nodemap->nodes[j].pnn == pnn) {
564 continue;
567 /* don't check nodes that are unavailable */
567 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
568 continue;
571 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
572 mem_ctx, &remote_dbmap);
573 if (ret != 0) {
574 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
575 return -1;
578 /* step through all databases on the remote node */
579 for (db=0; db<remote_dbmap->num;db++) {
580 const char *name;
582 for (i=0;i<(*dbmap)->num;i++) {
583 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
584 break;
587 /* we already have this db locally */
588 if (i!=(*dbmap)->num) {
589 continue;
591 /* ok so we need to create this database and
592 rebuild dbmap
594 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
595 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
596 if (ret != 0) {
597 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
598 nodemap->nodes[j].pnn));
599 return -1;
601 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
602 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
603 if (ret != 0) {
604 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
605 return -1;
607 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
608 if (ret != 0) {
609 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
610 return -1;
615 return 0;
620 pull the remote database contents from one node into the recdb
622 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
623 struct tdb_wrap *recdb, uint32_t dbid)
625 int ret;
626 TDB_DATA outdata;
627 struct ctdb_marshall_buffer *reply;
628 struct ctdb_rec_data *rec;
629 int i;
630 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
632 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
633 CONTROL_TIMEOUT(), &outdata);
634 if (ret != 0) {
635 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
636 talloc_free(tmp_ctx);
637 return -1;
640 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
642 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
643 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
644 talloc_free(tmp_ctx);
645 return -1;
648 rec = (struct ctdb_rec_data *)&reply->data[0];
650 for (i=0;
651 i<reply->count;
652 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
653 TDB_DATA key, data;
654 struct ctdb_ltdb_header *hdr;
655 TDB_DATA existing;
657 key.dptr = &rec->data[0];
658 key.dsize = rec->keylen;
659 data.dptr = &rec->data[key.dsize];
660 data.dsize = rec->datalen;
662 hdr = (struct ctdb_ltdb_header *)data.dptr;
664 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
665 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
666 talloc_free(tmp_ctx);
667 return -1;
670 /* fetch the existing record, if any */
671 existing = tdb_fetch(recdb->tdb, key);
673 if (existing.dptr != NULL) {
674 struct ctdb_ltdb_header header;
675 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
676 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
677 (unsigned)existing.dsize, srcnode));
678 free(existing.dptr);
679 talloc_free(tmp_ctx);
680 return -1;
682 header = *(struct ctdb_ltdb_header *)existing.dptr;
683 free(existing.dptr);
684 if (!(header.rsn < hdr->rsn ||
685 (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
686 continue;
690 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
691 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
692 talloc_free(tmp_ctx);
693 return -1;
697 talloc_free(tmp_ctx);
699 return 0;
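/* Callback state used when querying all active nodes to find the one holding
   the highest sequence number for a persistent database. */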
703 struct pull_seqnum_cbdata {
704 int failed;
705 uint32_t pnn;
706 uint64_t seqnum;
709 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
711 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
712 uint64_t seqnum;
714 if (cb_data->failed != 0) {
715 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
716 return;
719 if (res != 0) {
720 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
721 cb_data->failed = 1;
722 return;
725 if (outdata.dsize != sizeof(uint64_t)) {
726 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
727 cb_data->failed = -1;
728 return;
731 seqnum = *((uint64_t *)outdata.dptr);
733 if (seqnum > cb_data->seqnum ||
734 (cb_data->pnn == -1 && seqnum == 0)) {
735 cb_data->seqnum = seqnum;
736 cb_data->pnn = node_pnn;
740 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
742 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
744 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
745 cb_data->failed = 1;
748 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
749 struct ctdb_recoverd *rec,
750 struct ctdb_node_map *nodemap,
751 struct tdb_wrap *recdb, uint32_t dbid)
753 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
754 uint32_t *nodes;
755 TDB_DATA data;
756 uint32_t outdata[2];
757 struct pull_seqnum_cbdata *cb_data;
759 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
761 outdata[0] = dbid;
762 outdata[1] = 0;
764 data.dsize = sizeof(outdata);
765 data.dptr = (uint8_t *)&outdata[0];
767 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
768 if (cb_data == NULL) {
769 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
770 talloc_free(tmp_ctx);
771 return -1;
774 cb_data->failed = 0;
775 cb_data->pnn = -1;
776 cb_data->seqnum = 0;
778 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
779 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
780 nodes, 0,
781 CONTROL_TIMEOUT(), false, data,
782 pull_seqnum_cb,
783 pull_seqnum_fail_cb,
784 cb_data) != 0) {
785 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
787 talloc_free(tmp_ctx);
788 return -1;
791 if (cb_data->failed != 0) {
792 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
793 talloc_free(tmp_ctx);
794 return -1;
797 if (cb_data->pnn == -1) {
798 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
799 talloc_free(tmp_ctx);
800 return -1;
803 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
805 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
806 DEBUG(DEBUG_ERR, ("Failed to pull highest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
807 talloc_free(tmp_ctx);
808 return -1;
811 talloc_free(tmp_ctx);
812 return 0;
817 pull all the remote database contents into the recdb
819 static int pull_remote_database(struct ctdb_context *ctdb,
820 struct ctdb_recoverd *rec,
821 struct ctdb_node_map *nodemap,
822 struct tdb_wrap *recdb, uint32_t dbid,
823 bool persistent)
825 int j;
827 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
828 int ret;
829 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
830 if (ret == 0) {
831 return 0;
835 /* pull all records from all other nodes across onto this node
836 (this merges based on rsn)
838 for (j=0; j<nodemap->num; j++) {
839 /* don't merge from nodes that are unavailable */
840 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
841 continue;
843 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
844 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
845 nodemap->nodes[j].pnn));
846 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
847 return -1;
851 return 0;
856 update flags on all active nodes
858 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
860 int ret;
862 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
863 if (ret != 0) {
864 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
865 return -1;
868 return 0;
872 ensure all nodes have the same vnnmap we do
874 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
875 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
877 int j, ret;
879 /* push the new vnn map out to all the nodes */
880 for (j=0; j<nodemap->num; j++) {
881 /* don't push to nodes that are unavailable */
882 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
883 continue;
886 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
887 if (ret != 0) {
888 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
889 return -1;
893 return 0;
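/* State for one vacuum-fetch operation: a batch of records received from
   srcnode that vacuum_fetch_next() migrates to this node one at a time. */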
897 struct vacuum_info {
898 struct vacuum_info *next, *prev;
899 struct ctdb_recoverd *rec;
900 uint32_t srcnode;
901 struct ctdb_db_context *ctdb_db;
902 struct ctdb_marshall_buffer *recs;
903 struct ctdb_rec_data *r;
906 static void vacuum_fetch_next(struct vacuum_info *v);
909 called when a vacuum fetch has completed - just free it and do the next one
911 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
913 struct vacuum_info *v = talloc_get_type(state->async.private_data, struct vacuum_info);
914 talloc_free(state);
915 vacuum_fetch_next(v);
920 process the next element from the vacuum list
922 static void vacuum_fetch_next(struct vacuum_info *v)
924 struct ctdb_call call;
925 struct ctdb_rec_data *r;
927 while (v->recs->count) {
928 struct ctdb_client_call_state *state;
929 TDB_DATA data;
930 struct ctdb_ltdb_header *hdr;
932 ZERO_STRUCT(call);
933 call.call_id = CTDB_NULL_FUNC;
934 call.flags = CTDB_IMMEDIATE_MIGRATION;
935 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
937 r = v->r;
938 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
939 v->recs->count--;
941 call.key.dptr = &r->data[0];
942 call.key.dsize = r->keylen;
944 /* ensure we don't block this daemon - just skip a record if we can't get
945 the chainlock */
946 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
947 continue;
950 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
951 if (data.dptr == NULL) {
952 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
953 continue;
956 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
957 free(data.dptr);
958 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
959 continue;
962 hdr = (struct ctdb_ltdb_header *)data.dptr;
963 if (hdr->dmaster == v->rec->ctdb->pnn) {
964 /* it's already local */
965 free(data.dptr);
966 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
967 continue;
970 free(data.dptr);
972 state = ctdb_call_send(v->ctdb_db, &call);
973 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
974 if (state == NULL) {
975 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
976 talloc_free(v);
977 return;
979 state->async.fn = vacuum_fetch_callback;
980 state->async.private_data = v;
981 return;
984 talloc_free(v);
989 destroy a vacuum info structure
991 static int vacuum_info_destructor(struct vacuum_info *v)
993 DLIST_REMOVE(v->rec->vacuum_info, v);
994 return 0;
999 handler for vacuum fetch
1001 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
1002 TDB_DATA data, void *private_data)
1004 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1005 struct ctdb_marshall_buffer *recs;
1006 int ret, i;
1007 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1008 const char *name;
1009 struct ctdb_dbid_map *dbmap=NULL;
1010 bool persistent = false;
1011 struct ctdb_db_context *ctdb_db;
1012 struct ctdb_rec_data *r;
1013 uint32_t srcnode;
1014 struct vacuum_info *v;
1016 recs = (struct ctdb_marshall_buffer *)data.dptr;
1017 r = (struct ctdb_rec_data *)&recs->data[0];
1019 if (recs->count == 0) {
1020 talloc_free(tmp_ctx);
1021 return;
1024 srcnode = r->reqid;
1026 for (v=rec->vacuum_info;v;v=v->next) {
1027 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
1028 /* we're already working on records from this node */
1029 talloc_free(tmp_ctx);
1030 return;
1034 /* work out if the database is persistent */
1035 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1036 if (ret != 0) {
1037 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1038 talloc_free(tmp_ctx);
1039 return;
1042 for (i=0;i<dbmap->num;i++) {
1043 if (dbmap->dbs[i].dbid == recs->db_id) {
1044 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1045 break;
1048 if (i == dbmap->num) {
1049 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1050 talloc_free(tmp_ctx);
1051 return;
1054 /* find the name of this database */
1055 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1056 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1057 talloc_free(tmp_ctx);
1058 return;
1061 /* attach to it */
1062 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1063 if (ctdb_db == NULL) {
1064 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
1065 talloc_free(tmp_ctx);
1066 return;
1069 v = talloc_zero(rec, struct vacuum_info);
1070 if (v == NULL) {
1071 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1072 talloc_free(tmp_ctx);
1073 return;
1076 v->rec = rec;
1077 v->srcnode = srcnode;
1078 v->ctdb_db = ctdb_db;
1079 v->recs = talloc_memdup(v, recs, data.dsize);
1080 if (v->recs == NULL) {
1081 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1082 talloc_free(v);
1083 talloc_free(tmp_ctx);
1084 return;
1086 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
1088 DLIST_ADD(rec->vacuum_info, v);
1090 talloc_set_destructor(v, vacuum_info_destructor);
1092 vacuum_fetch_next(v);
1093 talloc_free(tmp_ctx);
1098 called when ctdb_wait_timeout should finish
1100 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1101 struct timeval yt, void *p)
1103 uint32_t *timed_out = (uint32_t *)p;
1104 (*timed_out) = 1;
1108 wait for a given number of seconds
1110 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1112 uint32_t timed_out = 0;
1113 time_t usecs = (secs - (time_t)secs) * 1000000;
1114 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1115 while (!timed_out) {
1116 event_loop_once(ctdb->ev);
1121 called when an election times out (ends)
1123 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1124 struct timeval t, void *p)
1126 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1127 rec->election_timeout = NULL;
1128 fast_start = false;
1130 DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
1135 wait for an election to finish. It finishes election_timeout seconds after
1136 the last election packet is received
1138 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1140 struct ctdb_context *ctdb = rec->ctdb;
1141 while (rec->election_timeout) {
1142 event_loop_once(ctdb->ev);
1147 Update our local flags from all remote connected nodes.
1148 This is only run when we are, or believe we are, the recovery master
1150 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1152 int j;
1153 struct ctdb_context *ctdb = rec->ctdb;
1154 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1156 /* get the nodemap for all active remote nodes and verify
1157 they are the same as for this node
1159 for (j=0; j<nodemap->num; j++) {
1160 struct ctdb_node_map *remote_nodemap=NULL;
1161 int ret;
1163 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1164 continue;
1166 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1167 continue;
1170 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1171 mem_ctx, &remote_nodemap);
1172 if (ret != 0) {
1173 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1174 nodemap->nodes[j].pnn));
1175 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1176 talloc_free(mem_ctx);
1177 return MONITOR_FAILED;
1179 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1180 /* We should tell our daemon about this so it
1181 updates its flags or else we will log the same
1182 message again in the next iteration of recovery.
1183 Since we are the recovery master we can just as
1184 well update the flags on all nodes.
1186 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1187 if (ret != 0) {
1188 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1189 return -1;
1192 /* Update our local copy of the flags in the recovery
1193 daemon.
1195 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1196 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1197 nodemap->nodes[j].flags));
1198 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1200 talloc_free(remote_nodemap);
1202 talloc_free(mem_ctx);
1203 return MONITOR_OK;
1207 /* Create a new random generation id.
1208 The generation id cannot be the INVALID_GENERATION id
1210 static uint32_t new_generation(void)
1212 uint32_t generation;
1214 while (1) {
1215 generation = random();
1217 if (generation != INVALID_GENERATION) {
1218 break;
1222 return generation;
1227 create a temporary working database
1229 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1231 char *name;
1232 struct tdb_wrap *recdb;
1233 unsigned tdb_flags;
1235 /* open up the temporary recovery database */
1236 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1237 ctdb->db_directory_state,
1238 ctdb->pnn);
1239 if (name == NULL) {
1240 return NULL;
1242 unlink(name);
1244 tdb_flags = TDB_NOLOCK;
1245 if (ctdb->valgrinding) {
1246 tdb_flags |= TDB_NOMMAP;
1248 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1250 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1251 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1252 if (recdb == NULL) {
1253 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1256 talloc_free(name);
1258 return recdb;
1263 a traverse function for pulling all relevant records from recdb
1265 struct recdb_data {
1266 struct ctdb_context *ctdb;
1267 struct ctdb_marshall_buffer *recdata;
1268 uint32_t len;
1269 uint32_t allocated_len;
1270 bool failed;
1271 bool persistent;
1274 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1276 struct recdb_data *params = (struct recdb_data *)p;
1277 struct ctdb_rec_data *rec;
1278 struct ctdb_ltdb_header *hdr;
1281 * skip empty records - but NOT for persistent databases:
1283 * The record-by-record mode of recovery deletes empty records.
1284 * For persistent databases, this can lead to data corruption
1285 * by deleting records that should be there:
1287 * - Assume the cluster has been running for a while.
1289 * - A record R in a persistent database has been created and
1290 * deleted a couple of times, the last operation being deletion,
1291 * leaving an empty record with a high RSN, say 10.
1293 * - Now a node N is turned off.
1295 * - This leaves the local copy of the database on N with the empty
1296 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1297 * the copy of record R.
1299 * - Now the record is created again while node N is turned off.
1300 * This creates R with RSN = 1 on all nodes except for N.
1302 * - Now node N is turned on again. The following recovery will choose
1303 * the older empty copy of R due to RSN 10 > RSN 1.
1305 * ==> Hence the record is gone after the recovery.
1307 * On databases like Samba's registry, this can damage the higher-level
1308 * data structures built from the various tdb-level records.
1310 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1311 return 0;
1314 /* update the dmaster field to point to us */
1315 hdr = (struct ctdb_ltdb_header *)data.dptr;
1316 if (!params->persistent) {
1317 hdr->dmaster = params->ctdb->pnn;
1318 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1321 /* add the record to the blob ready to send to the nodes */
1322 rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1323 if (rec == NULL) {
1324 params->failed = true;
1325 return -1;
1327 if (params->len + rec->length >= params->allocated_len) {
1328 params->allocated_len = rec->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1329 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1331 if (params->recdata == NULL) {
1332 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1333 rec->length + params->len));
1334 params->failed = true;
1335 return -1;
1337 params->recdata->count++;
1338 memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length);
1339 params->len += rec->length;
1340 talloc_free(rec);
1342 return 0;
1346 push the recdb database out to all nodes
1348 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1349 bool persistent,
1350 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1352 struct recdb_data params;
1353 struct ctdb_marshall_buffer *recdata;
1354 TDB_DATA outdata;
1355 TALLOC_CTX *tmp_ctx;
1356 uint32_t *nodes;
1358 tmp_ctx = talloc_new(ctdb);
1359 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1361 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1362 CTDB_NO_MEMORY(ctdb, recdata);
1364 recdata->db_id = dbid;
1366 params.ctdb = ctdb;
1367 params.recdata = recdata;
1368 params.len = offsetof(struct ctdb_marshall_buffer, data);
1369 params.allocated_len = params.len;
1370 params.failed = false;
1371 params.persistent = persistent;
1373 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1374 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1375 talloc_free(params.recdata);
1376 talloc_free(tmp_ctx);
1377 return -1;
1380 if (params.failed) {
1381 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1382 talloc_free(params.recdata);
1383 talloc_free(tmp_ctx);
1384 return -1;
1387 recdata = params.recdata;
1389 outdata.dptr = (void *)recdata;
1390 outdata.dsize = params.len;
1392 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1393 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1394 nodes, 0,
1395 CONTROL_TIMEOUT(), false, outdata,
1396 NULL, NULL,
1397 NULL) != 0) {
1398 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1399 talloc_free(recdata);
1400 talloc_free(tmp_ctx);
1401 return -1;
1404 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1405 dbid, recdata->count));
1407 talloc_free(recdata);
1408 talloc_free(tmp_ctx);
1410 return 0;
1415 go through a full recovery on one database
1417 static int recover_database(struct ctdb_recoverd *rec,
1418 TALLOC_CTX *mem_ctx,
1419 uint32_t dbid,
1420 bool persistent,
1421 uint32_t pnn,
1422 struct ctdb_node_map *nodemap,
1423 uint32_t transaction_id)
1425 struct tdb_wrap *recdb;
1426 int ret;
1427 struct ctdb_context *ctdb = rec->ctdb;
1428 TDB_DATA data;
1429 struct ctdb_control_wipe_database w;
1430 uint32_t *nodes;
1432 recdb = create_recdb(ctdb, mem_ctx);
1433 if (recdb == NULL) {
1434 return -1;
1437 /* pull all remote databases onto the recdb */
1438 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1439 if (ret != 0) {
1440 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1441 return -1;
1444 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1446 /* wipe all the remote databases. This is safe as we are in a transaction */
1447 w.db_id = dbid;
1448 w.transaction_id = transaction_id;
1450 data.dptr = (void *)&w;
1451 data.dsize = sizeof(w);
1453 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1454 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1455 nodes, 0,
1456 CONTROL_TIMEOUT(), false, data,
1457 NULL, NULL,
1458 NULL) != 0) {
1459 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1460 talloc_free(recdb);
1461 return -1;
1464 /* push out the correct database. This sets the dmaster and skips
1465 the empty records */
1466 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1467 if (ret != 0) {
1468 talloc_free(recdb);
1469 return -1;
1472 /* all done with this database */
1473 talloc_free(recdb);
1475 return 0;
1478 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1479 struct ctdb_recoverd *rec,
1480 struct ctdb_node_map *nodemap,
1481 uint32_t *culprit)
1483 int j;
1484 int ret;
1486 if (ctdb->num_nodes != nodemap->num) {
1487 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1488 ctdb->num_nodes, nodemap->num));
1489 if (culprit) {
1490 *culprit = ctdb->pnn;
1492 return -1;
1495 for (j=0; j<nodemap->num; j++) {
1496 /* For readability */
1497 struct ctdb_node *node = ctdb->nodes[j];
1499 /* release any existing data */
1500 if (node->known_public_ips) {
1501 talloc_free(node->known_public_ips);
1502 node->known_public_ips = NULL;
1504 if (node->available_public_ips) {
1505 talloc_free(node->available_public_ips);
1506 node->available_public_ips = NULL;
1509 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1510 continue;
1513 /* Retrieve the list of known public IPs from the node */
1514 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1515 CONTROL_TIMEOUT(),
1516 node->pnn,
1517 ctdb->nodes,
1519 &node->known_public_ips);
1520 if (ret != 0) {
1521 DEBUG(DEBUG_ERR,
1522 ("Failed to read known public IPs from node: %u\n",
1523 node->pnn));
1524 if (culprit) {
1525 *culprit = node->pnn;
1527 return -1;
1530 if (ctdb->do_checkpublicip &&
1531 rec->takeover_runs_disable_ctx == NULL &&
1532 verify_remote_ip_allocation(ctdb,
1533 node->known_public_ips,
1534 node->pnn)) {
1535 DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
1536 rec->need_takeover_run = true;
1539 /* Retrieve the list of available public IPs from the node */
1540 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1541 CONTROL_TIMEOUT(),
1542 node->pnn,
1543 ctdb->nodes,
1544 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1545 &node->available_public_ips);
1546 if (ret != 0) {
1547 DEBUG(DEBUG_ERR,
1548 ("Failed to read available public IPs from node: %u\n",
1549 node->pnn));
1550 if (culprit) {
1551 *culprit = node->pnn;
1553 return -1;
1557 return 0;
1560 /* when we start a recovery, make sure all nodes use the same reclock file
1561 setting
1563 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1565 struct ctdb_context *ctdb = rec->ctdb;
1566 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1567 TDB_DATA data;
1568 uint32_t *nodes;
1570 if (ctdb->recovery_lock_file == NULL) {
1571 data.dptr = NULL;
1572 data.dsize = 0;
1573 } else {
1574 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1575 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1578 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1579 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1580 nodes, 0,
1581 CONTROL_TIMEOUT(),
1582 false, data,
1583 NULL, NULL,
1584 rec) != 0) {
1585 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1586 talloc_free(tmp_ctx);
1587 return -1;
1590 talloc_free(tmp_ctx);
1591 return 0;
1596 * this callback is called for every node that failed to execute ctdb_takeover_run()
1597 * and sets a flag to re-run the takeover run.
1599 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1601 DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1603 if (callback_data != NULL) {
1604 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1606 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1608 ctdb_set_culprit(rec, node_pnn);
1613 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1615 struct ctdb_context *ctdb = rec->ctdb;
1616 int i;
1617 struct ctdb_banning_state *ban_state;
1619 *self_ban = false;
1620 for (i=0; i<ctdb->num_nodes; i++) {
1621 if (ctdb->nodes[i]->ban_state == NULL) {
1622 continue;
1624 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1625 if (ban_state->count < 2*ctdb->num_nodes) {
1626 continue;
1629 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1630 ctdb->nodes[i]->pnn, ban_state->count,
1631 ctdb->tunable.recovery_ban_period));
1632 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1633 ban_state->count = 0;
1635 /* Banning ourself? */
1636 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
1637 *self_ban = true;
1642 static bool do_takeover_run(struct ctdb_recoverd *rec,
1643 struct ctdb_node_map *nodemap,
1644 bool banning_credits_on_fail)
1646 uint32_t *nodes = NULL;
1647 struct srvid_request_data dtr;
1648 TDB_DATA data;
1649 int i;
1650 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1651 int ret;
1652 bool ok;
1654 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1656 if (rec->takeover_run_in_progress) {
1657 DEBUG(DEBUG_ERR, (__location__
1658 " takeover run already in progress \n"));
1659 ok = false;
1660 goto done;
1663 rec->takeover_run_in_progress = true;
1665 /* If takeover runs are disabled then fail... */
1666 if (rec->takeover_runs_disable_ctx != NULL) {
1667 DEBUG(DEBUG_ERR,
1668 ("Takeover runs are disabled so refusing to run one\n"));
1669 ok = false;
1670 goto done;
1673 /* Disable IP checks (takeover runs, really) on other nodes
1674 * while doing this takeover run. This will stop those other
1675 * nodes from triggering takeover runs when they think they should
1676 * be hosting an IP but it isn't yet on an interface. Don't
1677 * wait for replies since a failure here might cause some
1678 * noise in the logs but will not actually cause a problem.
1680 dtr.srvid = 0; /* No reply */
1681 dtr.pnn = -1;
1683 data.dptr = (uint8_t*)&dtr;
1684 data.dsize = sizeof(dtr);
1686 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1688 /* Disable for 60 seconds. This can be a tunable later if
1689 * necessary.
1691 dtr.data = 60;
1692 for (i = 0; i < talloc_array_length(nodes); i++) {
1693 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1694 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1695 data) != 0) {
1696 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1700 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1701 rec->force_rebalance_nodes,
1702 takeover_fail_callback,
1703 banning_credits_on_fail ? rec : NULL);
1705 /* Reenable takeover runs and IP checks on other nodes */
1706 dtr.data = 0;
1707 for (i = 0; i < talloc_array_length(nodes); i++) {
1708 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1709 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1710 data) != 0) {
1711 DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
1715 if (ret != 0) {
1716 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1717 ok = false;
1718 goto done;
1721 ok = true;
1722 /* Takeover run was successful so clear force rebalance targets */
1723 if (rebalance_nodes == rec->force_rebalance_nodes) {
1724 TALLOC_FREE(rec->force_rebalance_nodes);
1725 } else {
1726 DEBUG(DEBUG_WARNING,
1727 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1729 done:
1730 rec->need_takeover_run = !ok;
1731 talloc_free(nodes);
1732 rec->takeover_run_in_progress = false;
1734 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1735 return ok;
1740 we are the recmaster, and recovery is needed - start a recovery run
1742 static int do_recovery(struct ctdb_recoverd *rec,
1743 TALLOC_CTX *mem_ctx, uint32_t pnn,
1744 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1746 struct ctdb_context *ctdb = rec->ctdb;
1747 int i, j, ret;
1748 uint32_t generation;
1749 struct ctdb_dbid_map *dbmap;
1750 TDB_DATA data;
1751 uint32_t *nodes;
1752 struct timeval start_time;
1753 uint32_t culprit = (uint32_t)-1;
1754 bool self_ban;
1756 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1758 /* if recovery fails, force it again */
1759 rec->need_recovery = true;
1761 ban_misbehaving_nodes(rec, &self_ban);
1762 if (self_ban) {
1763 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1764 return -1;
1767 if (ctdb->tunable.verify_recovery_lock != 0) {
1768 DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
1769 start_time = timeval_current();
1770 if (!ctdb_recovery_lock(ctdb, true)) {
1771 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1772 "and ban ourself for %u seconds\n",
1773 ctdb->tunable.recovery_ban_period));
1774 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1775 return -1;
1777 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
1778 DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
1781 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1783 /* get a list of all databases */
1784 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1785 if (ret != 0) {
1786 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1787 return -1;
1790 /* we do the db creation before we set the recovery mode, so the freeze happens
1791 on all databases we will be dealing with. */
1793 /* verify that we have all the databases any other node has */
1794 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1795 if (ret != 0) {
1796 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1797 return -1;
1800 /* verify that all other nodes have all our databases */
1801 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1802 if (ret != 0) {
1803 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1804 return -1;
1806 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1808 /* update the database priority for all remote databases */
1809 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1810 if (ret != 0) {
1811 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1813 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1816 /* update all other nodes to use the same setting for reclock files
1817 as the local recovery master.
1819 sync_recovery_lock_file_across_cluster(rec);
1821 /* set recovery mode to active on all nodes */
1822 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1823 if (ret != 0) {
1824 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1825 return -1;
1828 /* execute the "startrecovery" event script on all nodes */
1829 ret = run_startrecovery_eventscript(rec, nodemap);
1830 if (ret!=0) {
1831 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1832 return -1;
1836 update all nodes to have the same flags that we have
1838 for (i=0;i<nodemap->num;i++) {
1839 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1840 continue;
1843 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1844 if (ret != 0) {
1845 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1846 DEBUG(DEBUG_WARNING, (__location__ " Unable to update flags on inactive node %d\n", i));
1847 } else {
1848 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1849 return -1;
1854 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1856 /* pick a new generation number */
1857 generation = new_generation();
1859 /* change the vnnmap on this node to use the new generation
1860 number but not on any other nodes.
1861 this guarantees that if we abort the recovery prematurely
1862 for some reason (a node stops responding?)
1863 that we can just return immediately and we will reenter
1864 recovery shortly again.
1865 I.e. we deliberately leave the cluster with an inconsistent
1866 generation id to allow us to abort recovery at any stage and
1867 just restart it from scratch.
1869 vnnmap->generation = generation;
1870 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1871 if (ret != 0) {
1872 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1873 return -1;
1876 data.dptr = (void *)&generation;
1877 data.dsize = sizeof(uint32_t);
1879 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1880 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1881 nodes, 0,
1882 CONTROL_TIMEOUT(), false, data,
1883 NULL,
1884 transaction_start_fail_callback,
1885 rec) != 0) {
1886 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1887 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1888 nodes, 0,
1889 CONTROL_TIMEOUT(), false, tdb_null,
1890 NULL,
1891 NULL,
1892 NULL) != 0) {
1893 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1895 return -1;
1898 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1900 for (i=0;i<dbmap->num;i++) {
1901 ret = recover_database(rec, mem_ctx,
1902 dbmap->dbs[i].dbid,
1903 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1904 pnn, nodemap, generation);
1905 if (ret != 0) {
1906 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
1907 return -1;
1911 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1913 /* commit all the changes */
1914 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1915 nodes, 0,
1916 CONTROL_TIMEOUT(), false, data,
1917 NULL, NULL,
1918 NULL) != 0) {
1919 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1920 return -1;
1923 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1926 /* update the capabilities for all nodes */
1927 ret = update_capabilities(ctdb, nodemap);
1928 if (ret!=0) {
1929 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1930 return -1;
1933 /* build a new vnn map with all the currently active and
1934 unbanned nodes */
1935 generation = new_generation();
1936 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1937 CTDB_NO_MEMORY(ctdb, vnnmap);
1938 vnnmap->generation = generation;
1939 vnnmap->size = 0;
1940 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1941 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1942 for (i=j=0;i<nodemap->num;i++) {
1943 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1944 continue;
1946 if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
1947 /* this node can not be an lmaster */
1948 DEBUG(DEBUG_DEBUG, ("Node %d can't be an LMASTER, skipping it\n", i));
1949 continue;
1952 vnnmap->size++;
1953 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1954 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1955 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1958 if (vnnmap->size == 0) {
1959 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1960 vnnmap->size++;
1961 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1962 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1963 vnnmap->map[0] = pnn;
1966 /* update to the new vnnmap on all nodes */
1967 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1968 if (ret != 0) {
1969 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1970 return -1;
1973 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1975 /* update recmaster to point to us for all nodes */
1976 ret = set_recovery_master(ctdb, nodemap, pnn);
1977 if (ret!=0) {
1978 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
1979 return -1;
1982 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
1984 /* disable recovery mode */
1985 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
1986 if (ret != 0) {
1987 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1988 return -1;
1991 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1993 /* Fetch known/available public IPs from each active node */
1994 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
1995 if (ret != 0) {
1996 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
1997 culprit));
1998 rec->need_takeover_run = true;
1999 return -1;
2002 do_takeover_run(rec, nodemap, false);
2004 /* execute the "recovered" event script on all nodes */
2005 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2006 if (ret!=0) {
2007 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2008 return -1;
2011 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2013 /* send a message to all clients telling them that the cluster
2014 has been reconfigured */
2015 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2016 CTDB_SRVID_RECONFIGURE, tdb_null);
2017 if (ret != 0) {
2018 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2019 return -1;
2022 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2024 rec->need_recovery = false;
2026 /* we managed to complete a full recovery, make sure to forgive
2027 any past sins by the nodes that could now participate in the
2028 recovery.
2030 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2031 for (i=0;i<nodemap->num;i++) {
2032 struct ctdb_banning_state *ban_state;
2034 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2035 continue;
2038 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2039 if (ban_state == NULL) {
2040 continue;
2043 ban_state->count = 0;
2047 /* We just finished a recovery successfully.
2048 We now wait for rerecovery_timeout before we allow
2049 another recovery to take place.
2051 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2052 ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
2053 DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
2055 return 0;
2060 elections are won by first checking the number of connected nodes, then
2061 the priority time, then the pnn
2063 struct election_message {
2064 uint32_t num_connected;
2065 struct timeval priority_time;
2066 uint32_t pnn;
2067 uint32_t node_flags;
2071 form this node's election data
2073 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2075 int ret, i;
2076 struct ctdb_node_map *nodemap;
2077 struct ctdb_context *ctdb = rec->ctdb;
2079 ZERO_STRUCTP(em);
2081 em->pnn = rec->ctdb->pnn;
2082 em->priority_time = rec->priority_time;
2084 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2085 if (ret != 0) {
2086 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2087 return;
2090 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2091 em->node_flags = rec->node_flags;
2093 for (i=0;i<nodemap->num;i++) {
2094 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2095 em->num_connected++;
2099 /* we shouldn't try to win this election if we can't be a recmaster */
2100 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2101 em->num_connected = 0;
2102 em->priority_time = timeval_current();
2105 talloc_free(nodemap);
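/* Zeroing num_connected and resetting priority_time to "now" above
 * makes a node without the recmaster capability compare as the least
 * attractive candidate possible, so other nodes will not elect it
 * either; ctdb_election_win() below additionally refuses to win
 * locally in that case.
 */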
2109 see if the given election data wins
2111 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2113 struct election_message myem;
2114 int cmp = 0;
2116 ctdb_election_data(rec, &myem);
2118 /* we can't win if we don't have the recmaster capability */
2119 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2120 return false;
2123 /* we can't win if we are banned */
2124 if (rec->node_flags & NODE_FLAGS_BANNED) {
2125 return false;
2128 /* we can't win if we are stopped */
2129 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2130 return false;
2133 /* we will automatically win if the other node is banned */
2134 if (em->node_flags & NODE_FLAGS_BANNED) {
2135 return true;
2138 /* we will automatically win if the other node is stopped */
2139 if (em->node_flags & NODE_FLAGS_STOPPED) {
2140 return true;
2143 /* try to use the most connected node */
2144 if (cmp == 0) {
2145 cmp = (int)myem.num_connected - (int)em->num_connected;
2148 /* then the longest running node */
2149 if (cmp == 0) {
2150 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2153 if (cmp == 0) {
2154 cmp = (int)myem.pnn - (int)em->pnn;
2157 return cmp > 0;
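/* A worked example with purely hypothetical values: the local node has
 * num_connected=3, priority_time=100.0s, pnn=2 and the remote
 * election_message carries num_connected=3, priority_time=250.5s,
 * pnn=0.  The connected counts tie, so priority_time decides:
 * timeval_compare() favours the earlier (longest running) timestamp,
 * here the local node, and we return true.  Only if that also ties
 * does the larger pnn win the final comparison.
 */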
2161 send out an election request
2163 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2165 int ret;
2166 TDB_DATA election_data;
2167 struct election_message emsg;
2168 uint64_t srvid;
2169 struct ctdb_context *ctdb = rec->ctdb;
2171 srvid = CTDB_SRVID_RECOVERY;
2173 ctdb_election_data(rec, &emsg);
2175 election_data.dsize = sizeof(struct election_message);
2176 election_data.dptr = (unsigned char *)&emsg;
2179 /* first we assume we will win the election and set the
2180 recovery master to be ourselves on the current node
2182 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2183 if (ret != 0) {
2184 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
2185 return -1;
2189 /* send an election message to all active nodes */
2190 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2191 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2195 this function will unban all nodes in the cluster
2197 static void unban_all_nodes(struct ctdb_context *ctdb)
2199 int ret, i;
2200 struct ctdb_node_map *nodemap;
2201 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2203 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2204 if (ret != 0) {
2205 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2206 return;
2209 for (i=0;i<nodemap->num;i++) {
2210 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2211 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2212 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(),
2213 nodemap->nodes[i].pnn, 0,
2214 NODE_FLAGS_BANNED);
2215 if (ret != 0) {
2216 DEBUG(DEBUG_ERR, (__location__ " failed to reset ban state\n"));
2221 talloc_free(tmp_ctx);
2226 we think we are winning the election - send a broadcast election request
2228 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2230 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2231 int ret;
2233 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2234 if (ret != 0) {
2235 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2238 talloc_free(rec->send_election_te);
2239 rec->send_election_te = NULL;
2243 handler for memory dumps
2245 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2246 TDB_DATA data, void *private_data)
2248 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2249 TDB_DATA *dump;
2250 int ret;
2251 struct srvid_request *rd;
2253 if (data.dsize != sizeof(struct srvid_request)) {
2254 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2255 talloc_free(tmp_ctx);
2256 return;
2258 rd = (struct srvid_request *)data.dptr;
2260 dump = talloc_zero(tmp_ctx, TDB_DATA);
2261 if (dump == NULL) {
2262 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2263 talloc_free(tmp_ctx);
2264 return;
2266 ret = ctdb_dump_memory(ctdb, dump);
2267 if (ret != 0) {
2268 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2269 talloc_free(tmp_ctx);
2270 return;
2273 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2275 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2276 if (ret != 0) {
2277 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2278 talloc_free(tmp_ctx);
2279 return;
2282 talloc_free(tmp_ctx);
2286 handler for getlog
2288 static void getlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2289 TDB_DATA data, void *private_data)
2291 struct ctdb_get_log_addr *log_addr;
2292 pid_t child;
2294 if (data.dsize != sizeof(struct ctdb_get_log_addr)) {
2295 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2296 return;
2298 log_addr = (struct ctdb_get_log_addr *)data.dptr;
2300 child = ctdb_fork_no_free_ringbuffer(ctdb);
2301 if (child == (pid_t)-1) {
2302 DEBUG(DEBUG_ERR,("Failed to fork a log collector child\n"));
2303 return;
2306 if (child == 0) {
2307 ctdb_set_process_name("ctdb_rec_log_collector");
2308 if (switch_from_server_to_client(ctdb, "recoverd-log-collector") != 0) {
2309 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch log collector child into client mode.\n"));
2310 _exit(1);
2312 ctdb_collect_log(ctdb, log_addr);
2313 _exit(0);
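/* Only the forked child collects and sends the log; the parent returns
 * immediately, so a large or slow log dump does not stall the recovery
 * daemon's event loop.
 */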
2318 handler for clearlog
2320 static void clearlog_handler(struct ctdb_context *ctdb, uint64_t srvid,
2321 TDB_DATA data, void *private_data)
2323 ctdb_clear_log(ctdb);
2327 handler for reload_nodes
2329 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2330 TDB_DATA data, void *private_data)
2332 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2334 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2336 ctdb_load_nodes_file(rec->ctdb);
2340 static void ctdb_rebalance_timeout(struct event_context *ev,
2341 struct timed_event *te,
2342 struct timeval t, void *p)
2344 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2346 if (rec->force_rebalance_nodes == NULL) {
2347 DEBUG(DEBUG_ERR,
2348 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2349 return;
2352 DEBUG(DEBUG_NOTICE,
2353 ("Rebalance timeout occurred - do takeover run\n"));
2354 do_takeover_run(rec, rec->nodemap, false);
2358 static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
2359 uint64_t srvid,
2360 TDB_DATA data, void *private_data)
2362 uint32_t pnn;
2363 uint32_t *t;
2364 int len;
2365 uint32_t deferred_rebalance;
2366 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2368 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2369 return;
2372 if (data.dsize != sizeof(uint32_t)) {
2373 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2374 return;
2377 pnn = *(uint32_t *)&data.dptr[0];
2379 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2381 /* Copy any existing list of nodes into a new array rather than
2382 * reallocating in place: the old array is the talloc parent of any
2383 * pending rebalance timer, so freeing it below also cancels that
2384 * timeout, which a plain realloc would not guarantee.
2386 len = (rec->force_rebalance_nodes != NULL) ?
2387 talloc_array_length(rec->force_rebalance_nodes) :
2390 /* This allows duplicates to be added but they don't cause
2391 * harm. A call to add a duplicate PNN arguably means that
2392 * the timeout should be reset, so this is the simplest
2393 * solution.
2395 t = talloc_zero_array(rec, uint32_t, len+1);
2396 CTDB_NO_MEMORY_VOID(ctdb, t);
2397 if (len > 0) {
2398 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2400 t[len] = pnn;
2402 talloc_free(rec->force_rebalance_nodes);
2404 rec->force_rebalance_nodes = t;
2406 /* If configured, setup a deferred takeover run to make sure
2407 * that certain nodes get IPs rebalanced to them. This will
2408 * be cancelled if a successful takeover run happens before
2409 * the timeout. Assign tunable value to variable for
2410 * readability.
2412 deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2413 if (deferred_rebalance != 0) {
2414 event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
2415 timeval_current_ofs(deferred_rebalance, 0),
2416 ctdb_rebalance_timeout, rec);
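/* The timer above is allocated on rec->force_rebalance_nodes, so
 * talloc_free()ing that list - which a successful takeover run or a
 * later rebalance message does - also removes any pending rebalance
 * timeout.
 */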
2422 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2423 TDB_DATA data, void *private_data)
2425 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2426 struct ctdb_public_ip *ip;
2428 if (rec->recmaster != rec->ctdb->pnn) {
2429 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2430 return;
2433 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2434 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2435 return;
2438 ip = (struct ctdb_public_ip *)data.dptr;
2440 update_ip_assignment_tree(rec->ctdb, ip);
2444 static void clear_takeover_runs_disable(struct ctdb_recoverd *rec)
2446 TALLOC_FREE(rec->takeover_runs_disable_ctx);
2449 static void reenable_takeover_runs(struct event_context *ev,
2450 struct timed_event *te,
2451 struct timeval yt, void *p)
2453 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2455 DEBUG(DEBUG_NOTICE,("Reenabling takeover runs after timeout\n"));
2456 clear_takeover_runs_disable(rec);
2459 static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
2460 uint64_t srvid, TDB_DATA data,
2461 void *private_data)
2463 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2464 struct ctdb_recoverd);
2465 struct srvid_request_data *r;
2466 uint32_t timeout;
2467 TDB_DATA result;
2468 int32_t ret = 0;
2470 /* Validate input data */
2471 if (data.dsize != sizeof(struct srvid_request_data)) {
2472 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2473 "expecting %lu\n", (long unsigned)data.dsize,
2474 (long unsigned)sizeof(struct srvid_request_data)));
2475 return;
2477 if (data.dptr == NULL) {
2478 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2479 return;
2482 r = (struct srvid_request_data *)data.dptr;
2483 timeout = r->data;
2485 if (timeout == 0) {
2486 DEBUG(DEBUG_NOTICE,("Reenabling takeover runs\n"));
2487 clear_takeover_runs_disable(rec);
2488 ret = ctdb_get_pnn(ctdb);
2489 goto done;
2492 if (rec->takeover_run_in_progress) {
2493 DEBUG(DEBUG_ERR,
2494 ("Unable to disable takeover runs - in progress\n"));
2495 ret = -EAGAIN;
2496 goto done;
2499 DEBUG(DEBUG_NOTICE,("Disabling takeover runs for %u seconds\n", timeout));
2501 /* Clear any old timers */
2502 clear_takeover_runs_disable(rec);
2504 /* When this is non-NULL it indicates that takeover runs are
2505 * disabled. This context also holds the timeout timer.
2507 rec->takeover_runs_disable_ctx = talloc_new(rec);
2508 if (rec->takeover_runs_disable_ctx == NULL) {
2509 DEBUG(DEBUG_ERR,(__location__ " Unable to allocate memory\n"));
2510 ret = -ENOMEM;
2511 goto done;
2514 /* Arrange for the timeout to occur */
2515 event_add_timed(ctdb->ev, rec->takeover_runs_disable_ctx,
2516 timeval_current_ofs(timeout, 0),
2517 reenable_takeover_runs,
2518 rec);
2520 /* Returning our PNN tells the caller that we succeeded */
2521 ret = ctdb_get_pnn(ctdb);
2522 done:
2523 result.dsize = sizeof(int32_t);
2524 result.dptr = (uint8_t *)&ret;
2525 srvid_request_reply(ctdb, (struct srvid_request *)r, result);
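/* The reply convention used above: a non-negative value (this node's
 * PNN) means takeover runs were disabled or re-enabled as requested,
 * while a negative errno (-EAGAIN, -ENOMEM) reports failure.  As a
 * purely hypothetical illustration, a client could ask for a 60 second
 * disable roughly like this (my_reply_srvid and recmaster_pnn are
 * placeholders, not names from this file):
 *
 *   struct srvid_request_data req = {
 *       .pnn   = ctdb_get_pnn(ctdb),
 *       .srvid = my_reply_srvid,
 *       .data  = 60,
 *   };
 *   TDB_DATA d = { .dptr = (uint8_t *)&req, .dsize = sizeof(req) };
 *   ctdb_client_send_message(ctdb, recmaster_pnn,
 *                            CTDB_SRVID_DISABLE_TAKEOVER_RUNS, d);
 */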
2528 /* Backward compatibility for this SRVID - call
2529 * disable_takeover_runs_handler() instead
2531 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2532 TDB_DATA data, void *private_data)
2534 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2535 struct ctdb_recoverd);
2536 TDB_DATA data2;
2537 struct srvid_request_data *req;
2539 if (data.dsize != sizeof(uint32_t)) {
2540 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2541 "expecting %lu\n", (long unsigned)data.dsize,
2542 (long unsigned)sizeof(uint32_t)));
2543 return;
2545 if (data.dptr == NULL) {
2546 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2547 return;
2550 req = talloc(ctdb, struct srvid_request_data);
2551 CTDB_NO_MEMORY_VOID(ctdb, req);
2553 req->srvid = 0; /* No reply */
2554 req->pnn = -1;
2555 req->data = *((uint32_t *)data.dptr); /* Timeout */
2557 data2.dsize = sizeof(*req);
2558 data2.dptr = (uint8_t *)req;
2560 disable_takeover_runs_handler(rec->ctdb,
2561 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
2562 data2, rec);
2566 handler for ip reallocate, just add it to the list of requests and
2567 handle this later in the monitor_cluster loop so we do not recurse
2568 with other requests to takeover_run()
2570 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2571 TDB_DATA data, void *private_data)
2573 struct srvid_request *request;
2574 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2575 struct ctdb_recoverd);
2577 if (data.dsize != sizeof(struct srvid_request)) {
2578 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2579 return;
2582 request = (struct srvid_request *)data.dptr;
2584 srvid_request_add(ctdb, &rec->reallocate_requests, request);
2587 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2588 struct ctdb_recoverd *rec)
2590 TDB_DATA result;
2591 int32_t ret;
2592 uint32_t culprit;
2593 struct srvid_requests *current;
2595 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2597 /* Only process requests that are currently pending. More
2598 * might come in while the takeover run is in progress and
2599 * they will need to be processed later since they might
2600 * be in response to flag changes.
2602 current = rec->reallocate_requests;
2603 rec->reallocate_requests = NULL;
2605 /* update the list of public ips that a node can handle for
2606 all connected nodes
2608 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2609 if (ret != 0) {
2610 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2611 culprit));
2612 rec->need_takeover_run = true;
2614 if (ret == 0) {
2615 if (do_takeover_run(rec, rec->nodemap, false)) {
2616 ret = ctdb_get_pnn(ctdb);
2617 } else {
2618 ret = -1;
2622 result.dsize = sizeof(int32_t);
2623 result.dptr = (uint8_t *)&ret;
2625 srvid_requests_reply(ctdb, &current, result);
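/* All requests captured above receive the same reply: this node's PNN
 * if the takeover run succeeded, and an error value otherwise.
 * Requests queued while the run was in progress were left on
 * rec->reallocate_requests and will be answered on the next pass
 * through this function.
 */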
2630 handler for recovery master elections
2632 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2633 TDB_DATA data, void *private_data)
2635 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2636 int ret;
2637 struct election_message *em = (struct election_message *)data.dptr;
2638 TALLOC_CTX *mem_ctx;
2640 /* Ignore election packets from ourselves */
2641 if (ctdb->pnn == em->pnn) {
2642 return;
2645 /* we got an election packet - update the timeout for the election */
2646 talloc_free(rec->election_timeout);
2647 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2648 fast_start ?
2649 timeval_current_ofs(0, 500000) :
2650 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2651 ctdb_election_timeout, rec);
2653 mem_ctx = talloc_new(ctdb);
2655 /* someone called an election. check their election data
2656 and if we disagree and we would rather be the elected node,
2657 send a new election message to all other nodes
2659 if (ctdb_election_win(rec, em)) {
2660 if (!rec->send_election_te) {
2661 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2662 timeval_current_ofs(0, 500000),
2663 election_send_request, rec);
2665 talloc_free(mem_ctx);
2666 /*unban_all_nodes(ctdb);*/
2667 return;
2670 /* we didn't win */
2671 talloc_free(rec->send_election_te);
2672 rec->send_election_te = NULL;
2674 if (ctdb->tunable.verify_recovery_lock != 0) {
2675 /* release the recmaster lock */
2676 if (em->pnn != ctdb->pnn &&
2677 ctdb->recovery_lock_fd != -1) {
2678 close(ctdb->recovery_lock_fd);
2679 ctdb->recovery_lock_fd = -1;
2680 unban_all_nodes(ctdb);
2684 /* ok, let that guy become recmaster then */
2685 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2686 if (ret != 0) {
2687 DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request"));
2688 talloc_free(mem_ctx);
2689 return;
2692 talloc_free(mem_ctx);
2693 return;
2698 force the start of the election process
2700 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2701 struct ctdb_node_map *nodemap)
2703 int ret;
2704 struct ctdb_context *ctdb = rec->ctdb;
2706 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2708 /* set all nodes to recovery mode to stop all internode traffic */
2709 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2710 if (ret != 0) {
2711 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2712 return;
2715 talloc_free(rec->election_timeout);
2716 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2717 fast_start ?
2718 timeval_current_ofs(0, 500000) :
2719 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2720 ctdb_election_timeout, rec);
2722 ret = send_election_request(rec, pnn);
2723 if (ret!=0) {
2724 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2725 return;
2728 /* wait for a few seconds to collect all responses */
2729 ctdb_wait_election(rec);
2735 handler for when a node changes its flags
2737 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2738 TDB_DATA data, void *private_data)
2740 int ret;
2741 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2742 struct ctdb_node_map *nodemap=NULL;
2743 TALLOC_CTX *tmp_ctx;
2744 int i;
2745 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2746 int disabled_flag_changed;
2748 if (data.dsize != sizeof(*c)) {
2749 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2750 return;
2753 tmp_ctx = talloc_new(ctdb);
2754 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2756 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2757 if (ret != 0) {
2758 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2759 talloc_free(tmp_ctx);
2760 return;
2764 for (i=0;i<nodemap->num;i++) {
2765 if (nodemap->nodes[i].pnn == c->pnn) break;
2768 if (i == nodemap->num) {
2769 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2770 talloc_free(tmp_ctx);
2771 return;
2774 if (c->old_flags != c->new_flags) {
2775 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2778 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2780 nodemap->nodes[i].flags = c->new_flags;
2782 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2783 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2785 if (ret == 0) {
2786 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2787 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2790 if (ret == 0 &&
2791 ctdb->recovery_master == ctdb->pnn &&
2792 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2793 /* Only do the takeover run if the permanently disabled or unhealthy
2794 flags changed, since these cause an ip failover but not
2795 a recovery.
2796 If the node became disconnected or banned this also
2797 leads to an ip address failover, but that case is handled
2798 during recovery
2800 if (disabled_flag_changed) {
2801 rec->need_takeover_run = true;
2805 talloc_free(tmp_ctx);
2809 handler for when we need to push out flag changes to all other nodes
2811 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2812 TDB_DATA data, void *private_data)
2814 int ret;
2815 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2816 struct ctdb_node_map *nodemap=NULL;
2817 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2818 uint32_t recmaster;
2819 uint32_t *nodes;
2821 /* find the recovery master */
2822 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2823 if (ret != 0) {
2824 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2825 talloc_free(tmp_ctx);
2826 return;
2829 /* read the node flags from the recmaster */
2830 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2831 if (ret != 0) {
2832 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2833 talloc_free(tmp_ctx);
2834 return;
2836 if (c->pnn >= nodemap->num) {
2837 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2838 talloc_free(tmp_ctx);
2839 return;
2842 /* send the flags update to all connected nodes */
2843 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2845 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2846 nodes, 0, CONTROL_TIMEOUT(),
2847 false, data,
2848 NULL, NULL,
2849 NULL) != 0) {
2850 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2852 talloc_free(tmp_ctx);
2853 return;
2856 talloc_free(tmp_ctx);
2860 struct verify_recmode_normal_data {
2861 uint32_t count;
2862 enum monitor_result status;
2865 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2867 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2870 /* one more node has responded with recmode data*/
2871 rmdata->count--;
2873 /* if we failed to get the recmode, then return an error and let
2874 the main loop try again.
2876 if (state->state != CTDB_CONTROL_DONE) {
2877 if (rmdata->status == MONITOR_OK) {
2878 rmdata->status = MONITOR_FAILED;
2880 return;
2883 /* if we got a response, then the recmode will be stored in the
2884 status field
2886 if (state->status != CTDB_RECOVERY_NORMAL) {
2887 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2888 rmdata->status = MONITOR_RECOVERY_NEEDED;
2891 return;
2895 /* verify that all nodes are in normal recovery mode */
2896 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2898 struct verify_recmode_normal_data *rmdata;
2899 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2900 struct ctdb_client_control_state *state;
2901 enum monitor_result status;
2902 int j;
2904 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2905 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2906 rmdata->count = 0;
2907 rmdata->status = MONITOR_OK;
2909 /* loop over all active nodes and send an async getrecmode call to
2910 them*/
2911 for (j=0; j<nodemap->num; j++) {
2912 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2913 continue;
2915 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2916 CONTROL_TIMEOUT(),
2917 nodemap->nodes[j].pnn);
2918 if (state == NULL) {
2919 /* we failed to send the control, treat this as
2920 an error and try again next iteration
2922 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2923 talloc_free(mem_ctx);
2924 return MONITOR_FAILED;
2927 /* set up the callback functions */
2928 state->async.fn = verify_recmode_normal_callback;
2929 state->async.private_data = rmdata;
2931 /* one more control to wait for to complete */
2932 rmdata->count++;
2936 /* now wait for up to the maximum number of seconds allowed
2937 or until all nodes we expect a response from have replied
2939 while (rmdata->count > 0) {
2940 event_loop_once(ctdb->ev);
2943 status = rmdata->status;
2944 talloc_free(mem_ctx);
2945 return status;
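/* verify_recmode() above and verify_recmaster() below share the same
 * async fan-out pattern: send a control to every active node, count
 * the outstanding replies in rmdata->count, and spin event_loop_once()
 * until every callback has decremented the counter.  The callbacks
 * never reset rmdata->status back to MONITOR_OK, so a single bad reply
 * is enough to trigger a recovery or an election in the main loop.
 */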
2949 struct verify_recmaster_data {
2950 struct ctdb_recoverd *rec;
2951 uint32_t count;
2952 uint32_t pnn;
2953 enum monitor_result status;
2956 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2958 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2961 /* one more node has responded with recmaster data*/
2962 rmdata->count--;
2964 /* if we failed to get the recmaster, then return an error and let
2965 the main loop try again.
2967 if (state->state != CTDB_CONTROL_DONE) {
2968 if (rmdata->status == MONITOR_OK) {
2969 rmdata->status = MONITOR_FAILED;
2971 return;
2974 /* if we got a response, then the recmaster will be stored in the
2975 status field
2977 if (state->status != rmdata->pnn) {
2978 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2979 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2980 rmdata->status = MONITOR_ELECTION_NEEDED;
2983 return;
2987 /* verify that all nodes agree that we are the recmaster */
2988 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
2990 struct ctdb_context *ctdb = rec->ctdb;
2991 struct verify_recmaster_data *rmdata;
2992 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2993 struct ctdb_client_control_state *state;
2994 enum monitor_result status;
2995 int j;
2997 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2998 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2999 rmdata->rec = rec;
3000 rmdata->count = 0;
3001 rmdata->pnn = pnn;
3002 rmdata->status = MONITOR_OK;
3004 /* loop over all active nodes and send an async getrecmaster call to
3005 them*/
3006 for (j=0; j<nodemap->num; j++) {
3007 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3008 continue;
3010 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3011 CONTROL_TIMEOUT(),
3012 nodemap->nodes[j].pnn);
3013 if (state == NULL) {
3014 /* we failed to send the control, treat this as
3015 an error and try again next iteration
3017 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3018 talloc_free(mem_ctx);
3019 return MONITOR_FAILED;
3022 /* set up the callback functions */
3023 state->async.fn = verify_recmaster_callback;
3024 state->async.private_data = rmdata;
3026 /* one more control to wait for to complete */
3027 rmdata->count++;
3031 /* now wait for up to the maximum number of seconds allowed
3032 or until all nodes we expect a response from have replied
3034 while (rmdata->count > 0) {
3035 event_loop_once(ctdb->ev);
3038 status = rmdata->status;
3039 talloc_free(mem_ctx);
3040 return status;
3043 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3044 struct ctdb_recoverd *rec)
3046 struct ctdb_control_get_ifaces *ifaces = NULL;
3047 TALLOC_CTX *mem_ctx;
3048 bool ret = false;
3050 mem_ctx = talloc_new(NULL);
3052 /* Read the interfaces from the local node */
3053 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3054 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3055 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3056 /* We could return an error. However, this will be
3057 * rare so we'll decide that the interfaces have
3058 * actually changed, just in case.
3060 talloc_free(mem_ctx);
3061 return true;
3064 if (!rec->ifaces) {
3065 /* We haven't been here before so things have changed */
3066 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3067 ret = true;
3068 } else if (rec->ifaces->num != ifaces->num) {
3069 /* Number of interfaces has changed */
3070 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3071 rec->ifaces->num, ifaces->num));
3072 ret = true;
3073 } else {
3074 /* See if interface names or link states have changed */
3075 int i;
3076 for (i = 0; i < rec->ifaces->num; i++) {
3077 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
3078 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3079 DEBUG(DEBUG_NOTICE,
3080 ("Interface in slot %d changed: %s => %s\n",
3081 i, iface->name, ifaces->ifaces[i].name));
3082 ret = true;
3083 break;
3085 if (iface->link_state != ifaces->ifaces[i].link_state) {
3086 DEBUG(DEBUG_NOTICE,
3087 ("Interface %s changed state: %d => %d\n",
3088 iface->name, iface->link_state,
3089 ifaces->ifaces[i].link_state));
3090 ret = true;
3091 break;
3096 talloc_free(rec->ifaces);
3097 rec->ifaces = talloc_steal(rec, ifaces);
3099 talloc_free(mem_ctx);
3100 return ret;
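/* Whatever the outcome, rec->ifaces is replaced with the freshly read
 * list above, so each call reports only changes since the previous
 * poll from verify_local_ip_allocation().
 */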
3103 /* called to check that the local allocation of public ip addresses is ok.
3105 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
3107 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3108 struct ctdb_uptime *uptime1 = NULL;
3109 struct ctdb_uptime *uptime2 = NULL;
3110 int ret, j;
3111 bool need_takeover_run = false;
3113 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3114 CTDB_CURRENT_NODE, &uptime1);
3115 if (ret != 0) {
3116 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3117 talloc_free(mem_ctx);
3118 return -1;
3121 if (interfaces_have_changed(ctdb, rec)) {
3122 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3123 "local node %u - force takeover run\n",
3124 pnn));
3125 need_takeover_run = true;
3128 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3129 CTDB_CURRENT_NODE, &uptime2);
3130 if (ret != 0) {
3131 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3132 talloc_free(mem_ctx);
3133 return -1;
3136 /* skip the check if the startrecovery time has changed */
3137 if (timeval_compare(&uptime1->last_recovery_started,
3138 &uptime2->last_recovery_started) != 0) {
3139 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3140 talloc_free(mem_ctx);
3141 return 0;
3144 /* skip the check if the endrecovery time has changed */
3145 if (timeval_compare(&uptime1->last_recovery_finished,
3146 &uptime2->last_recovery_finished) != 0) {
3147 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3148 talloc_free(mem_ctx);
3149 return 0;
3152 /* skip the check if we have started but not finished recovery */
3153 if (timeval_compare(&uptime1->last_recovery_finished,
3154 &uptime1->last_recovery_started) != 1) {
3155 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3156 talloc_free(mem_ctx);
3158 return 0;
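/* The two uptime snapshots above bracket the interface check: if a
 * recovery started or finished in between, or one is still in
 * progress, the public ip layout may legitimately be in flux, so the
 * address verification below is skipped for this iteration and
 * retried on the next monitoring pass.
 */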
3161 /* verify that we have the ip addresses we should have
3162 and we don't have ones we shouldn't have.
3163 if we find an inconsistency we ask the recovery
3164 master to do a takeover run to fix up the
3165 ip assignments.
3166 also, if the pnn is -1 and we are healthy and can host the ip,
3167 we request an ip reallocation.
3169 if (ctdb->tunable.disable_ip_failover == 0) {
3170 struct ctdb_all_public_ips *ips = NULL;
3172 /* read the *available* IPs from the local node */
3173 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3174 if (ret != 0) {
3175 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3176 talloc_free(mem_ctx);
3177 return -1;
3180 for (j=0; j<ips->num; j++) {
3181 if (ips->ips[j].pnn == -1 &&
3182 nodemap->nodes[pnn].flags == 0) {
3183 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3184 ctdb_addr_to_str(&ips->ips[j].addr)));
3185 need_takeover_run = true;
3189 talloc_free(ips);
3191 /* read the *known* IPs from the local node */
3192 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3193 if (ret != 0) {
3194 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3195 talloc_free(mem_ctx);
3196 return -1;
3199 for (j=0; j<ips->num; j++) {
3200 if (ips->ips[j].pnn == pnn) {
3201 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3202 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3203 ctdb_addr_to_str(&ips->ips[j].addr)));
3204 need_takeover_run = true;
3206 } else {
3207 if (ctdb->do_checkpublicip &&
3208 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3210 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3211 ctdb_addr_to_str(&ips->ips[j].addr)));
3213 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3214 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3221 if (need_takeover_run) {
3222 struct srvid_request rd;
3223 TDB_DATA data;
3225 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3227 rd.pnn = ctdb->pnn;
3228 rd.srvid = 0;
3229 data.dptr = (uint8_t *)&rd;
3230 data.dsize = sizeof(rd);
3232 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3233 if (ret != 0) {
3234 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3237 talloc_free(mem_ctx);
3238 return 0;
3242 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3244 struct ctdb_node_map **remote_nodemaps = callback_data;
3246 if (node_pnn >= ctdb->num_nodes) {
3247 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3248 return;
3251 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
3255 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3256 struct ctdb_node_map *nodemap,
3257 struct ctdb_node_map **remote_nodemaps)
3259 uint32_t *nodes;
3261 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3262 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3263 nodes, 0,
3264 CONTROL_TIMEOUT(), false, tdb_null,
3265 async_getnodemap_callback,
3266 NULL,
3267 remote_nodemaps) != 0) {
3268 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3270 return -1;
3273 return 0;
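/* async_getnodemap_callback() above fills remote_nodemaps[] indexed by
 * the replying node's PNN; entries for nodes that never answer stay
 * NULL, which main_loop() treats as a reason to mark that node as a
 * culprit and restart monitoring.
 */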
3276 enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
3277 struct ctdb_check_reclock_state {
3278 struct ctdb_context *ctdb;
3279 struct timeval start_time;
3280 int fd[2];
3281 pid_t child;
3282 struct timed_event *te;
3283 struct fd_event *fde;
3284 enum reclock_child_status status;
3287 /* when we free the reclock state we must kill any child process.
3289 static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
3291 struct ctdb_context *ctdb = state->ctdb;
3293 ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
3295 if (state->fd[0] != -1) {
3296 close(state->fd[0]);
3297 state->fd[0] = -1;
3299 if (state->fd[1] != -1) {
3300 close(state->fd[1]);
3301 state->fd[1] = -1;
3303 ctdb_kill(ctdb, state->child, SIGKILL);
3304 return 0;
3308 called if our check_reclock child times out. this would happen if
3309 i/o to the reclock file blocks.
3311 static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te,
3312 struct timeval t, void *private_data)
3314 struct ctdb_check_reclock_state *state = talloc_get_type(private_data,
3315 struct ctdb_check_reclock_state);
3317 DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
3318 state->status = RECLOCK_TIMEOUT;
3321 /* this is called when the child process has completed checking the reclock
3322 file and has written data back to us through the pipe.
3324 static void reclock_child_handler(struct event_context *ev, struct fd_event *fde,
3325 uint16_t flags, void *private_data)
3327 struct ctdb_check_reclock_state *state= talloc_get_type(private_data,
3328 struct ctdb_check_reclock_state);
3329 char c = 0;
3330 int ret;
3332 /* we got a response from our child process so we can abort the
3333 timeout.
3335 talloc_free(state->te);
3336 state->te = NULL;
3338 ret = read(state->fd[0], &c, 1);
3339 if (ret != 1 || c != RECLOCK_OK) {
3340 DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
3341 state->status = RECLOCK_FAILED;
3343 return;
3346 state->status = RECLOCK_OK;
3347 return;
3350 static int check_recovery_lock(struct ctdb_context *ctdb)
3352 int ret;
3353 struct ctdb_check_reclock_state *state;
3354 pid_t parent = getpid();
3356 if (ctdb->recovery_lock_fd == -1) {
3357 DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
3358 return -1;
3361 state = talloc(ctdb, struct ctdb_check_reclock_state);
3362 CTDB_NO_MEMORY(ctdb, state);
3364 state->ctdb = ctdb;
3365 state->start_time = timeval_current();
3366 state->status = RECLOCK_CHECKING;
3367 state->fd[0] = -1;
3368 state->fd[1] = -1;
3370 ret = pipe(state->fd);
3371 if (ret != 0) {
3372 talloc_free(state);
3373 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
3374 return -1;
3377 state->child = ctdb_fork(ctdb);
3378 if (state->child == (pid_t)-1) {
3379 DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
3380 close(state->fd[0]);
3381 state->fd[0] = -1;
3382 close(state->fd[1]);
3383 state->fd[1] = -1;
3384 talloc_free(state);
3385 return -1;
3388 if (state->child == 0) {
3389 char cc = RECLOCK_OK;
3390 close(state->fd[0]);
3391 state->fd[0] = -1;
3393 ctdb_set_process_name("ctdb_rec_reclock");
3394 debug_extra = talloc_asprintf(NULL, "recovery-lock:");
3395 if (pread(ctdb->recovery_lock_fd, &cc, 1, 0) == -1) {
3396 DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
3397 cc = RECLOCK_FAILED;
3400 write(state->fd[1], &cc, 1);
3401 /* make sure we die when our parent dies */
3402 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
3403 sleep(5);
3405 _exit(0);
3407 close(state->fd[1]);
3408 state->fd[1] = -1;
3409 set_close_on_exec(state->fd[0]);
3411 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
3413 talloc_set_destructor(state, check_reclock_destructor);
3415 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
3416 ctdb_check_reclock_timeout, state);
3417 if (state->te == NULL) {
3418 DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
3419 talloc_free(state);
3420 return -1;
3423 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
3424 EVENT_FD_READ,
3425 reclock_child_handler,
3426 (void *)state);
3428 if (state->fde == NULL) {
3429 DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
3430 talloc_free(state);
3431 return -1;
3433 tevent_fd_set_auto_close(state->fde);
3435 while (state->status == RECLOCK_CHECKING) {
3436 event_loop_once(ctdb->ev);
3439 if (state->status == RECLOCK_FAILED) {
3440 DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
3441 close(ctdb->recovery_lock_fd);
3442 ctdb->recovery_lock_fd = -1;
3443 talloc_free(state);
3444 return -1;
3447 talloc_free(state);
3448 return 0;
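/* The pread() probe runs in a throw-away child connected back through
 * a pipe, so if the cluster filesystem hangs only the child blocks;
 * the 15 second timer above just flags RECLOCK_TIMEOUT and lets the
 * main loop continue, while the destructor SIGKILLs the child once the
 * state is freed.  Only an explicit RECLOCK_FAILED answer drops the
 * lock fd and makes the caller force a recovery.
 */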
3451 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3453 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3454 const char *reclockfile;
3456 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3457 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3458 talloc_free(tmp_ctx);
3459 return -1;
3462 if (reclockfile == NULL) {
3463 if (ctdb->recovery_lock_file != NULL) {
3464 DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
3465 talloc_free(ctdb->recovery_lock_file);
3466 ctdb->recovery_lock_file = NULL;
3467 if (ctdb->recovery_lock_fd != -1) {
3468 close(ctdb->recovery_lock_fd);
3469 ctdb->recovery_lock_fd = -1;
3472 ctdb->tunable.verify_recovery_lock = 0;
3473 talloc_free(tmp_ctx);
3474 return 0;
3477 if (ctdb->recovery_lock_file == NULL) {
3478 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3479 if (ctdb->recovery_lock_fd != -1) {
3480 close(ctdb->recovery_lock_fd);
3481 ctdb->recovery_lock_fd = -1;
3483 talloc_free(tmp_ctx);
3484 return 0;
3488 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3489 talloc_free(tmp_ctx);
3490 return 0;
3493 talloc_free(ctdb->recovery_lock_file);
3494 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3495 ctdb->tunable.verify_recovery_lock = 0;
3496 if (ctdb->recovery_lock_fd != -1) {
3497 close(ctdb->recovery_lock_fd);
3498 ctdb->recovery_lock_fd = -1;
3501 talloc_free(tmp_ctx);
3502 return 0;
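/* In every case where the reclock path goes away or changes, the old
 * fd is closed so that a lock held on the previous file is not
 * mistaken for holding the current one; verify_recovery_lock is also
 * cleared when the file is disabled or replaced.
 */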
3505 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3506 TALLOC_CTX *mem_ctx)
3508 uint32_t pnn;
3509 struct ctdb_node_map *nodemap=NULL;
3510 struct ctdb_node_map *recmaster_nodemap=NULL;
3511 struct ctdb_node_map **remote_nodemaps=NULL;
3512 struct ctdb_vnn_map *vnnmap=NULL;
3513 struct ctdb_vnn_map *remote_vnnmap=NULL;
3514 int32_t debug_level;
3515 int i, j, ret;
3516 bool self_ban;
3519 /* verify that the main daemon is still running */
3520 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3521 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3522 exit(-1);
3525 /* ping the local daemon to tell it we are alive */
3526 ctdb_ctrl_recd_ping(ctdb);
3528 if (rec->election_timeout) {
3529 /* an election is in progress */
3530 return;
3533 /* read the debug level from the parent and update locally */
3534 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3535 if (ret !=0) {
3536 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3537 return;
3539 LogLevel = debug_level;
3541 /* get relevant tunables */
3542 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3543 if (ret != 0) {
3544 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3545 return;
3548 /* get the current recovery lock file from the server */
3549 if (update_recovery_lock_file(ctdb) != 0) {
3550 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3551 return;
3554 /* Make sure that if recovery lock verification becomes disabled,
3555 we close the file
3557 if (ctdb->tunable.verify_recovery_lock == 0) {
3558 if (ctdb->recovery_lock_fd != -1) {
3559 close(ctdb->recovery_lock_fd);
3560 ctdb->recovery_lock_fd = -1;
3564 pnn = ctdb_get_pnn(ctdb);
3566 /* get the vnnmap */
3567 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3568 if (ret != 0) {
3569 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3570 return;
3574 /* get number of nodes */
3575 if (rec->nodemap) {
3576 talloc_free(rec->nodemap);
3577 rec->nodemap = NULL;
3578 nodemap=NULL;
3580 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3581 if (ret != 0) {
3582 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3583 return;
3585 nodemap = rec->nodemap;
3587 /* remember our own node flags */
3588 rec->node_flags = nodemap->nodes[pnn].flags;
3590 ban_misbehaving_nodes(rec, &self_ban);
3591 if (self_ban) {
3592 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3593 return;
3596 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3597 also frozen and that the recmode is set to active.
3599 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3600 /* If this node has become inactive then we want to
3601 * reduce the chances of it taking over the recovery
3602 * master role when it becomes active again. This
3603 * helps to stabilise the recovery master role so that
3604 * it stays on the most stable node.
3606 rec->priority_time = timeval_current();
3608 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3609 if (ret != 0) {
3610 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3612 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3613 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3615 ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
3616 if (ret != 0) {
3617 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3618 return;
3620 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3621 if (ret != 0) {
3622 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3624 return;
3628 /* If this node is stopped or banned then it is not the recovery
3629 * master, so don't do anything. This prevents a stopped or banned
3630 * node from starting an election and sending unnecessary controls.
3632 return;
3635 /* check which node is the recovery master */
3636 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3637 if (ret != 0) {
3638 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3639 return;
3642 /* If we are not the recmaster then do some housekeeping */
3643 if (rec->recmaster != pnn) {
3644 /* Ignore any IP reallocate requests - only recmaster
3645 * processes them
3647 TALLOC_FREE(rec->reallocate_requests);
3648 /* Clear any nodes that should be force rebalanced in
3649 * the next takeover run. If the recovery master role
3650 * has moved then we don't want to process these some
3651 * time in the future.
3653 TALLOC_FREE(rec->force_rebalance_nodes);
3656 /* This is a special case. When the recovery daemon is started, recmaster
3657 * is set to -1. If the node was not started in the stopped state, then
3658 * start an election to decide the recovery master
3660 if (rec->recmaster == (uint32_t)-1) {
3661 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3662 force_election(rec, pnn, nodemap);
3663 return;
3666 /* update the capabilities for all nodes */
3667 ret = update_capabilities(ctdb, nodemap);
3668 if (ret != 0) {
3669 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3670 return;
3674 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3675 * but we have, then force an election and try to become the new
3676 * recmaster.
3678 if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
3679 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3680 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3681 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3682 " but we (node %u) have - force an election\n",
3683 rec->recmaster, pnn));
3684 force_election(rec, pnn, nodemap);
3685 return;
3688 /* count how many active nodes there are */
3689 rec->num_active = 0;
3690 rec->num_lmasters = 0;
3691 rec->num_connected = 0;
3692 for (i=0; i<nodemap->num; i++) {
3693 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3694 rec->num_active++;
3695 if (rec->ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER) {
3696 rec->num_lmasters++;
3699 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
3700 rec->num_connected++;
3705 /* verify that the recmaster node is still active */
3706 for (j=0; j<nodemap->num; j++) {
3707 if (nodemap->nodes[j].pnn==rec->recmaster) {
3708 break;
3712 if (j == nodemap->num) {
3713 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3714 force_election(rec, pnn, nodemap);
3715 return;
3718 /* if recovery master is disconnected we must elect a new recmaster */
3719 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3720 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3721 force_election(rec, pnn, nodemap);
3722 return;
3725 /* get nodemap from the recovery master to check if it is inactive */
3726 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3727 mem_ctx, &recmaster_nodemap);
3728 if (ret != 0) {
3729 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3730 nodemap->nodes[j].pnn));
3731 return;
3735 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3736 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3737 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3739 * update our nodemap to carry the recmaster's notion of
3740 * its own flags, so that we don't keep freezing the
3741 * inactive recmaster node...
3743 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3744 force_election(rec, pnn, nodemap);
3745 return;
3748 /* verify that we have all ip addresses we should have and we don't
3749 * have addresses we shouldn't have.
3751 if (ctdb->tunable.disable_ip_failover == 0 &&
3752 rec->takeover_runs_disable_ctx == NULL) {
3753 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3754 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3759 /* if we are not the recmaster then we do not need to check
3760 if recovery is needed
3762 if (pnn != rec->recmaster) {
3763 return;
3767 /* ensure our local copies of flags are right */
3768 ret = update_local_flags(rec, nodemap);
3769 if (ret == MONITOR_ELECTION_NEEDED) {
3770 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3771 force_election(rec, pnn, nodemap);
3772 return;
3774 if (ret != MONITOR_OK) {
3775 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3776 return;
3779 if (ctdb->num_nodes != nodemap->num) {
3780 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3781 ctdb_load_nodes_file(ctdb);
3782 return;
3785 /* verify that all active nodes agree that we are the recmaster */
3786 switch (verify_recmaster(rec, nodemap, pnn)) {
3787 case MONITOR_RECOVERY_NEEDED:
3788 /* can not happen */
3789 return;
3790 case MONITOR_ELECTION_NEEDED:
3791 force_election(rec, pnn, nodemap);
3792 return;
3793 case MONITOR_OK:
3794 break;
3795 case MONITOR_FAILED:
3796 return;
3800 if (rec->need_recovery) {
3801 /* a previous recovery didn't finish */
3802 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3803 return;
3806 /* verify that all active nodes are in normal mode
3807 and not in recovery mode
3809 switch (verify_recmode(ctdb, nodemap)) {
3810 case MONITOR_RECOVERY_NEEDED:
3811 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3812 return;
3813 case MONITOR_FAILED:
3814 return;
3815 case MONITOR_ELECTION_NEEDED:
3816 /* can not happen */
3817 case MONITOR_OK:
3818 break;
3822 if (ctdb->tunable.verify_recovery_lock != 0) {
3823 /* we should have the reclock - check its not stale */
3824 ret = check_recovery_lock(ctdb);
3825 if (ret != 0) {
3826 DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
3827 ctdb_set_culprit(rec, ctdb->pnn);
3828 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3829 return;
3834 /* if there are takeovers requested, perform it and notify the waiters */
3835 if (rec->takeover_runs_disable_ctx == NULL &&
3836 rec->reallocate_requests) {
3837 process_ipreallocate_requests(ctdb, rec);
3840 /* get the nodemap for all active remote nodes
3842 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3843 if (remote_nodemaps == NULL) {
3844 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3845 return;
3847 for(i=0; i<nodemap->num; i++) {
3848 remote_nodemaps[i] = NULL;
3850 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3851 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3852 return;
3855 /* verify that all other nodes have the same nodemap as we have
3857 for (j=0; j<nodemap->num; j++) {
3858 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3859 continue;
3862 if (remote_nodemaps[j] == NULL) {
3863 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3864 ctdb_set_culprit(rec, j);
3866 return;
3869 /* if the nodes disagree on how many nodes there are
3870 then this is a good reason to try recovery
3872 if (remote_nodemaps[j]->num != nodemap->num) {
3873 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3874 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3875 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3876 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3877 return;
3880 /* if the nodes disagree on which nodes exist and are
3881 active, then that is also a good reason to do recovery
3883 for (i=0;i<nodemap->num;i++) {
3884 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3885 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3886 nodemap->nodes[j].pnn, i,
3887 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3888 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3889 do_recovery(rec, mem_ctx, pnn, nodemap,
3890 vnnmap);
3891 return;
3897 * Update node flags obtained from each active node. This ensures we have
3898 * up-to-date information for all the nodes.
3900 for (j=0; j<nodemap->num; j++) {
3901 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3902 continue;
3904 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3907 for (j=0; j<nodemap->num; j++) {
3908 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3909 continue;
3912 /* verify the flags are consistent
3913 */
3914 for (i=0; i<nodemap->num; i++) {
3915 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3916 continue;
3919 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3920 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3921 nodemap->nodes[j].pnn,
3922 nodemap->nodes[i].pnn,
3923 remote_nodemaps[j]->nodes[i].flags,
3924 nodemap->nodes[i].flags));
3925 if (i == j) {
3926 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3927 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3928 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3929 do_recovery(rec, mem_ctx, pnn, nodemap,
3930 vnnmap);
3931 return;
3932 } else {
3933 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3934 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3935 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3936 do_recovery(rec, mem_ctx, pnn, nodemap,
3937 vnnmap);
3938 return;
3939 }
3940 }
3941 }
3942 }
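/* Summary of the reconciliation above: a node is treated as authoritative
 * for its own flags (the i == j case), while the recovery master's view wins
 * for every other node.  Either way the corrected flags are pushed to all
 * nodes and a recovery is triggered so the cluster converges on one view. */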
3945 /* There must be the same number of lmasters in the vnn map as
3946  * there are active nodes with the lmaster capability... or
3947  * do a recovery.
3948  */
3949 if (vnnmap->size != rec->num_lmasters) {
3950 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3951 vnnmap->size, rec->num_lmasters));
3952 ctdb_set_culprit(rec, ctdb->pnn);
3953 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3954 return;
3957 /* verify that all active nodes in the nodemap also exist in
3958    the vnnmap.
3959 */
3960 for (j=0; j<nodemap->num; j++) {
3961 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3962 continue;
3964 if (nodemap->nodes[j].pnn == pnn) {
3965 continue;
3968 for (i=0; i<vnnmap->size; i++) {
3969 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3970 break;
3973 if (i == vnnmap->size) {
3974 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3975 nodemap->nodes[j].pnn));
3976 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3977 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3978 return;
3983 /* verify that all other nodes have the same vnnmap
3984    and are from the same generation
3985 */
3986 for (j=0; j<nodemap->num; j++) {
3987 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3988 continue;
3990 if (nodemap->nodes[j].pnn == pnn) {
3991 continue;
3994 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3995 mem_ctx, &remote_vnnmap);
3996 if (ret != 0) {
3997 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3998 nodemap->nodes[j].pnn));
3999 return;
4002 /* verify the vnnmap generation is the same */
4003 if (vnnmap->generation != remote_vnnmap->generation) {
4004 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
4005 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
4006 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4007 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4008 return;
4011 /* verify the vnnmap size is the same */
4012 if (vnnmap->size != remote_vnnmap->size) {
4013 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
4014 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
4015 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4016 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4017 return;
4020 /* verify the vnnmap is the same */
4021 for (i=0;i<vnnmap->size;i++) {
4022 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
4023 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
4024 nodemap->nodes[j].pnn));
4025 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
4026 do_recovery(rec, mem_ctx, pnn, nodemap,
4027 vnnmap);
4028 return;
4029 }
4030 }
4031 }
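/* Any disagreement detected above (generation, size or the mapping itself)
 * means the nodes no longer agree on which node is lmaster for which hash
 * range, so the only remedy is a recovery that rebuilds and pushes out a
 * fresh vnnmap. */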
4033 /* we might need to change who has what IP assigned */
4034 if (rec->need_takeover_run) {
4035 uint32_t culprit = (uint32_t)-1;
4037 rec->need_takeover_run = false;
4039 /* update the list of public ips that a node can handle for
4040    all connected nodes
4041 */
4042 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
4043 if (ret != 0) {
4044 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
4045 culprit));
4046 rec->need_takeover_run = true;
4047 return;
4050 /* execute the "startrecovery" event script on all nodes */
4051 ret = run_startrecovery_eventscript(rec, nodemap);
4052 if (ret!=0) {
4053 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
4054 ctdb_set_culprit(rec, ctdb->pnn);
4055 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4056 return;
4059 /* If the takeover run fails, the offending nodes are
4060  * assigned ban culprit counts and the takeover is retried.
4061  * If the takeover run fails repeatedly, the node will
4062  * eventually get banned.
4063  *
4064  * If rec->need_takeover_run is not set back to true on this
4065  * failure, monitoring stays disabled cluster-wide (via the
4066  * startrecovery eventscript) and will not get re-enabled.
4067  */
4068 if (!do_takeover_run(rec, nodemap, true)) {
4069 return;
4072 /* execute the "recovered" event script on all nodes */
4073 ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
4074 #if 0
4075 // we can't check whether the event completed successfully
4076 // since this script WILL fail if the node is in recovery mode
4077 // and if that race happens, the code here would just cause a second
4078 // cascading recovery.
4079 if (ret!=0) {
4080 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
4081 ctdb_set_culprit(rec, ctdb->pnn);
4082 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
4083 }
4084 #endif
4088 /*
4089   the main monitoring loop
4090  */
4091 static void monitor_cluster(struct ctdb_context *ctdb)
4092 {
4093 struct ctdb_recoverd *rec;
4095 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
4097 rec = talloc_zero(ctdb, struct ctdb_recoverd);
4098 CTDB_NO_MEMORY_FATAL(ctdb, rec);
4100 rec->ctdb = ctdb;
4102 rec->takeover_run_in_progress = false;
4104 rec->priority_time = timeval_current();
4106 /* register a message port for sending memory dumps */
4107 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
4109 /* register a message port for requesting logs */
4110 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_GETLOG, getlog_handler, rec);
4112 /* register a message port for clearing logs */
4113 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_CLEARLOG, clearlog_handler, rec);
4115 /* register a message port for recovery elections */
4116 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
4118 /* when nodes are disabled/enabled */
4119 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
4121 /* when we are asked to push out a flag change */
4122 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
4124 /* register a message port for vacuum fetch */
4125 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
4127 /* register a message port for reloadnodes */
4128 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
4130 /* register a message port for performing a takeover run */
4131 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
4133 /* register a message port for disabling the ip check for a short while */
4134 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
4136 /* register a message port for updating the recovery daemon's node assignment for an ip */
4137 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
4139 /* register a message port for forcing a rebalance of a node at the next
4140    reallocation */
4141 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
4143 /* Register a message port for disabling takeover runs */
4144 ctdb_client_set_message_handler(ctdb,
4145 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
4146 disable_takeover_runs_handler, rec);
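/* Illustrative sketch (not part of this file): a client elsewhere in the
 * cluster can drive one of the handlers registered above by sending a
 * message to the matching SRVID.  For example, to request a takeover run
 * from the recovery master, assuming the handler expects the same
 * struct srvid_request that is used for SRVID replies in this file:
 *
 *   struct srvid_request rq;
 *   TDB_DATA data;
 *
 *   rq.pnn   = own_pnn;          // node that should receive the reply
 *   rq.srvid = getpid();         // non-zero, so a reply is sent back
 *   data.dptr  = (uint8_t *)&rq;
 *   data.dsize = sizeof(rq);
 *   ctdb_client_send_message(ctdb, recmaster_pnn,
 *                            CTDB_SRVID_TAKEOVER_RUN, data);
 *
 * own_pnn and recmaster_pnn are placeholders for values the caller already
 * knows; they are not defined in this file.
 */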
4148 for (;;) {
4149 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4150 struct timeval start;
4151 double elapsed;
4153 if (!mem_ctx) {
4154 DEBUG(DEBUG_CRIT,(__location__
4155 " Failed to create temp context\n"));
4156 exit(-1);
4159 start = timeval_current();
4160 main_loop(ctdb, rec, mem_ctx);
4161 talloc_free(mem_ctx);
4163 /* we only run the recovery checks once every recover interval */
4164 elapsed = timeval_elapsed(&start);
4165 if (elapsed < ctdb->tunable.recover_interval) {
4166 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
4167 - elapsed);
4168 }
4169 }
4170 }
4172 /*
4173   event handler for when the main ctdbd dies
4174  */
4175 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
4176 uint16_t flags, void *private_data)
4177 {
4178 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
4179 _exit(1);
4182 /*
4183   called regularly to verify that the recovery daemon is still running
4184  */
4185 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
4186 struct timeval yt, void *p)
4187 {
4188 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
4190 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
4191 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
4193 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
4194 ctdb_restart_recd, ctdb);
4196 return;
4199 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4200 timeval_current_ofs(30, 0),
4201 ctdb_check_recd, ctdb);
4202 }
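/* The 30 second re-check interval above is hard-coded and is independent of
 * ctdb->tunable.recover_interval, which only paces the recovery daemon's own
 * monitoring loop. */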
4204 static void recd_sig_child_handler(struct event_context *ev,
4205 struct signal_event *se, int signum, int count,
4206 void *dont_care,
4207 void *private_data)
4209 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4210 int status;
4211 pid_t pid = -1;
4213 while (pid != 0) {
4214 pid = waitpid(-1, &status, WNOHANG);
4215 if (pid == -1) {
4216 if (errno != ECHILD) {
4217 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4219 return;
4221 if (pid > 0) {
4222 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
4227 /*
4228   startup the recovery daemon as a child of the main ctdb daemon
4229  */
4230 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4231 {
4232 int fd[2];
4233 struct signal_event *se;
4234 struct tevent_fd *fde;
4236 if (pipe(fd) != 0) {
4237 return -1;
4240 ctdb->ctdbd_pid = getpid();
4242 ctdb->recoverd_pid = ctdb_fork_no_free_ringbuffer(ctdb);
4243 if (ctdb->recoverd_pid == -1) {
4244 return -1;
4247 if (ctdb->recoverd_pid != 0) {
4248 talloc_free(ctdb->recd_ctx);
4249 ctdb->recd_ctx = talloc_new(ctdb);
4250 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4252 close(fd[0]);
4253 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4254 timeval_current_ofs(30, 0),
4255 ctdb_check_recd, ctdb);
4256 return 0;
4259 close(fd[1]);
4261 srandom(getpid() ^ time(NULL));
4263 /* Clear the log ringbuffer */
4264 ctdb_clear_log(ctdb);
4266 ctdb_set_process_name("ctdb_recovered");
4267 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4268 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4269 exit(1);
4272 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
4274 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
4275 ctdb_recoverd_parent, &fd[0]);
4276 tevent_fd_set_auto_close(fde);
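/* The pipe created at the top of this function is the parent-death detector:
 * the main ctdbd keeps fd[1] open for its lifetime while the recovery daemon
 * watches fd[0].  When ctdbd exits, fd[0] sees EOF, the read event fires and
 * ctdb_recoverd_parent() terminates this child.  A minimal sketch of the same
 * pattern in plain C (illustrative only, not ctdb code):
 *
 *   int fd[2];
 *   pipe(fd);
 *   if (fork() == 0) {                  // child
 *       close(fd[1]);
 *       struct pollfd p = { .fd = fd[0], .events = POLLIN };
 *       poll(&p, 1, -1);                // wakes up with EOF when the parent exits
 *       _exit(1);
 *   }
 *   close(fd[0]);                       // parent keeps fd[1] until it exits
 */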
4278 /* set up a handler to pick up sigchld */
4279 se = event_add_signal(ctdb->ev, ctdb,
4280 SIGCHLD, 0,
4281 recd_sig_child_handler,
4282 ctdb);
4283 if (se == NULL) {
4284 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4285 exit(1);
4288 monitor_cluster(ctdb);
4290 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4291 return -1;
4294 /*
4295   shutdown the recovery daemon
4296  */
4297 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4298 {
4299 if (ctdb->recoverd_pid == 0) {
4300 return;
4303 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4304 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
4306 TALLOC_FREE(ctdb->recd_ctx);
4307 TALLOC_FREE(ctdb->recd_ping_count);
4310 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4311 struct timeval t, void *private_data)
4312 {
4313 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4315 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4316 ctdb_stop_recoverd(ctdb);
4317 ctdb_start_recoverd(ctdb);
4318 }