4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
30 #include "dlinklist.h"
33 /* List of SRVID requests that need to be processed */
35 struct srvid_list
*next
, *prev
;
36 struct srvid_request
*request
;
39 struct srvid_requests
{
40 struct srvid_list
*requests
;
43 static void srvid_request_reply(struct ctdb_context
*ctdb
,
44 struct srvid_request
*request
,
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request
->srvid
== 0) {
53 if (ctdb_client_send_message(ctdb
, request
->pnn
, request
->srvid
,
55 DEBUG(DEBUG_INFO
,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request
->pnn
,
57 (unsigned long long)request
->srvid
));
59 DEBUG(DEBUG_ERR
,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request
->pnn
,
61 (unsigned long long)request
->srvid
));
67 static void srvid_requests_reply(struct ctdb_context
*ctdb
,
68 struct srvid_requests
**requests
,
73 for (r
= (*requests
)->requests
; r
!= NULL
; r
= r
->next
) {
74 srvid_request_reply(ctdb
, r
->request
, result
);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests
);
81 static void srvid_request_add(struct ctdb_context
*ctdb
,
82 struct srvid_requests
**requests
,
83 struct srvid_request
*request
)
89 if (*requests
== NULL
) {
90 *requests
= talloc_zero(ctdb
, struct srvid_requests
);
91 if (*requests
== NULL
) {
96 t
= talloc_zero(*requests
, struct srvid_list
);
98 /* If *requests was just allocated above then free it */
99 if ((*requests
)->requests
== NULL
) {
100 TALLOC_FREE(*requests
);
105 t
->request
= (struct srvid_request
*)talloc_steal(t
, request
);
106 DLIST_ADD((*requests
)->requests
, t
);
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR
, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
115 result
.dsize
= sizeof(ret
);
116 result
.dptr
= (uint8_t *)&ret
;
117 srvid_request_reply(ctdb
, request
, result
);
120 struct ctdb_banning_state
{
122 struct timeval last_reported_time
;
126 private state of recovery daemon
128 struct ctdb_recoverd
{
129 struct ctdb_context
*ctdb
;
132 uint32_t num_lmasters
;
133 uint32_t num_connected
;
134 uint32_t last_culprit_node
;
135 struct ctdb_node_map
*nodemap
;
136 struct timeval priority_time
;
137 bool need_takeover_run
;
140 struct timed_event
*send_election_te
;
141 struct timed_event
*election_timeout
;
142 struct vacuum_info
*vacuum_info
;
143 struct srvid_requests
*reallocate_requests
;
144 bool takeover_run_in_progress
;
145 TALLOC_CTX
*takeover_runs_disable_ctx
;
146 struct ctdb_control_get_ifaces
*ifaces
;
147 uint32_t *force_rebalance_nodes
;
150 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
151 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
153 static void ctdb_restart_recd(struct event_context
*ev
, struct timed_event
*te
, struct timeval t
, void *private_data
);
156 ban a node for a period of time
158 static void ctdb_ban_node(struct ctdb_recoverd
*rec
, uint32_t pnn
, uint32_t ban_time
)
161 struct ctdb_context
*ctdb
= rec
->ctdb
;
162 struct ctdb_ban_time bantime
;
164 if (!ctdb_validate_pnn(ctdb
, pnn
)) {
165 DEBUG(DEBUG_ERR
,("Bad pnn %u in ctdb_ban_node\n", pnn
));
169 DEBUG(DEBUG_NOTICE
,("Banning node %u for %u seconds\n", pnn
, ban_time
));
172 bantime
.time
= ban_time
;
174 ret
= ctdb_ctrl_set_ban(ctdb
, CONTROL_TIMEOUT(), pnn
, &bantime
);
176 DEBUG(DEBUG_ERR
,(__location__
" Failed to ban node %d\n", pnn
));
182 enum monitor_result
{ MONITOR_OK
, MONITOR_RECOVERY_NEEDED
, MONITOR_ELECTION_NEEDED
, MONITOR_FAILED
};
186 remember the trouble maker
188 static void ctdb_set_culprit_count(struct ctdb_recoverd
*rec
, uint32_t culprit
, uint32_t count
)
190 struct ctdb_context
*ctdb
= talloc_get_type(rec
->ctdb
, struct ctdb_context
);
191 struct ctdb_banning_state
*ban_state
;
193 if (culprit
> ctdb
->num_nodes
) {
194 DEBUG(DEBUG_ERR
,("Trying to set culprit %d but num_nodes is %d\n", culprit
, ctdb
->num_nodes
));
198 /* If we are banned or stopped, do not set other nodes as culprits */
199 if (rec
->node_flags
& NODE_FLAGS_INACTIVE
) {
200 DEBUG(DEBUG_NOTICE
, ("This node is INACTIVE, cannot set culprit node %d\n", culprit
));
204 if (ctdb
->nodes
[culprit
]->ban_state
== NULL
) {
205 ctdb
->nodes
[culprit
]->ban_state
= talloc_zero(ctdb
->nodes
[culprit
], struct ctdb_banning_state
);
206 CTDB_NO_MEMORY_VOID(ctdb
, ctdb
->nodes
[culprit
]->ban_state
);
210 ban_state
= ctdb
->nodes
[culprit
]->ban_state
;
211 if (timeval_elapsed(&ban_state
->last_reported_time
) > ctdb
->tunable
.recovery_grace_period
) {
212 /* this was the first time in a long while this node
213 misbehaved so we will forgive any old transgressions.
215 ban_state
->count
= 0;
218 ban_state
->count
+= count
;
219 ban_state
->last_reported_time
= timeval_current();
220 rec
->last_culprit_node
= culprit
;
224 remember the trouble maker
226 static void ctdb_set_culprit(struct ctdb_recoverd
*rec
, uint32_t culprit
)
228 ctdb_set_culprit_count(rec
, culprit
, 1);
232 /* this callback is called for every node that failed to execute the
235 static void recovered_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
237 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
239 DEBUG(DEBUG_ERR
, (__location__
" Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn
));
241 ctdb_set_culprit(rec
, node_pnn
);
245 run the "recovered" eventscript on all nodes
247 static int run_recovered_eventscript(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, const char *caller
)
251 struct ctdb_context
*ctdb
= rec
->ctdb
;
253 tmp_ctx
= talloc_new(ctdb
);
254 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
256 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
257 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_END_RECOVERY
,
259 CONTROL_TIMEOUT(), false, tdb_null
,
260 NULL
, recovered_fail_callback
,
262 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event when called from %s\n", caller
));
264 talloc_free(tmp_ctx
);
268 talloc_free(tmp_ctx
);
272 /* this callback is called for every node that failed to execute the
275 static void startrecovery_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
277 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
279 DEBUG(DEBUG_ERR
, (__location__
" Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn
));
281 ctdb_set_culprit(rec
, node_pnn
);
285 run the "startrecovery" eventscript on all nodes
287 static int run_startrecovery_eventscript(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
)
291 struct ctdb_context
*ctdb
= rec
->ctdb
;
293 tmp_ctx
= talloc_new(ctdb
);
294 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
296 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
297 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_START_RECOVERY
,
299 CONTROL_TIMEOUT(), false, tdb_null
,
301 startrecovery_fail_callback
,
303 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event. Recovery failed.\n"));
304 talloc_free(tmp_ctx
);
308 talloc_free(tmp_ctx
);
312 static void async_getcap_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
314 if ( (outdata
.dsize
!= sizeof(uint32_t)) || (outdata
.dptr
== NULL
) ) {
315 DEBUG(DEBUG_ERR
, (__location__
" Invalid length/pointer for getcap callback : %u %p\n", (unsigned)outdata
.dsize
, outdata
.dptr
));
318 if (node_pnn
< ctdb
->num_nodes
) {
319 ctdb
->nodes
[node_pnn
]->capabilities
= *((uint32_t *)outdata
.dptr
);
322 if (node_pnn
== ctdb
->pnn
) {
323 ctdb
->capabilities
= ctdb
->nodes
[node_pnn
]->capabilities
;
328 update the node capabilities for all connected nodes
330 static int update_capabilities(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
)
335 tmp_ctx
= talloc_new(ctdb
);
336 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
338 nodes
= list_of_connected_nodes(ctdb
, nodemap
, tmp_ctx
, true);
339 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_CAPABILITIES
,
343 async_getcap_callback
, NULL
,
345 DEBUG(DEBUG_ERR
, (__location__
" Failed to read node capabilities.\n"));
346 talloc_free(tmp_ctx
);
350 talloc_free(tmp_ctx
);
354 static void set_recmode_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
356 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
358 DEBUG(DEBUG_ERR
,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn
, rec
->nodemap
->num
));
359 ctdb_set_culprit_count(rec
, node_pnn
, rec
->nodemap
->num
);
362 static void transaction_start_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
364 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
366 DEBUG(DEBUG_ERR
,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn
, rec
->nodemap
->num
));
367 ctdb_set_culprit_count(rec
, node_pnn
, rec
->nodemap
->num
);
371 change recovery mode on all nodes
373 static int set_recovery_mode(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, uint32_t rec_mode
)
379 tmp_ctx
= talloc_new(ctdb
);
380 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
382 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
384 data
.dsize
= sizeof(uint32_t);
385 data
.dptr
= (unsigned char *)&rec_mode
;
387 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMODE
,
393 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode. Recovery failed.\n"));
394 talloc_free(tmp_ctx
);
398 /* freeze all nodes */
399 if (rec_mode
== CTDB_RECOVERY_ACTIVE
) {
402 for (i
=1; i
<=NUM_DB_PRIORITIES
; i
++) {
403 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_FREEZE
,
408 set_recmode_fail_callback
,
410 DEBUG(DEBUG_ERR
, (__location__
" Unable to freeze nodes. Recovery failed.\n"));
411 talloc_free(tmp_ctx
);
417 talloc_free(tmp_ctx
);
422 change recovery master on all node
424 static int set_recovery_master(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
, uint32_t pnn
)
430 tmp_ctx
= talloc_new(ctdb
);
431 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
433 data
.dsize
= sizeof(uint32_t);
434 data
.dptr
= (unsigned char *)&pnn
;
436 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
437 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECMASTER
,
439 CONTROL_TIMEOUT(), false, data
,
442 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recmaster. Recovery failed.\n"));
443 talloc_free(tmp_ctx
);
447 talloc_free(tmp_ctx
);
451 /* update all remote nodes to use the same db priority that we have
452 this can fail if the remove node has not yet been upgraded to
453 support this function, so we always return success and never fail
454 a recovery if this call fails.
456 static int update_db_priority_on_remote_nodes(struct ctdb_context
*ctdb
,
457 struct ctdb_node_map
*nodemap
,
458 uint32_t pnn
, struct ctdb_dbid_map
*dbmap
, TALLOC_CTX
*mem_ctx
)
462 /* step through all local databases */
463 for (db
=0; db
<dbmap
->num
;db
++) {
464 struct ctdb_db_priority db_prio
;
467 db_prio
.db_id
= dbmap
->dbs
[db
].dbid
;
468 ret
= ctdb_ctrl_get_db_priority(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, dbmap
->dbs
[db
].dbid
, &db_prio
.priority
);
470 DEBUG(DEBUG_ERR
,(__location__
" Failed to read database priority from local node for db 0x%08x\n", dbmap
->dbs
[db
].dbid
));
474 DEBUG(DEBUG_INFO
,("Update DB priority for db 0x%08x to %u\n", dbmap
->dbs
[db
].dbid
, db_prio
.priority
));
476 ret
= ctdb_ctrl_set_db_priority(ctdb
, CONTROL_TIMEOUT(),
477 CTDB_CURRENT_NODE
, &db_prio
);
479 DEBUG(DEBUG_ERR
,(__location__
" Failed to set DB priority for 0x%08x\n",
488 ensure all other nodes have attached to any databases that we have
490 static int create_missing_remote_databases(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
491 uint32_t pnn
, struct ctdb_dbid_map
*dbmap
, TALLOC_CTX
*mem_ctx
)
494 struct ctdb_dbid_map
*remote_dbmap
;
496 /* verify that all other nodes have all our databases */
497 for (j
=0; j
<nodemap
->num
; j
++) {
498 /* we dont need to ourself ourselves */
499 if (nodemap
->nodes
[j
].pnn
== pnn
) {
502 /* dont check nodes that are unavailable */
503 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
507 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
508 mem_ctx
, &remote_dbmap
);
510 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node %u\n", pnn
));
514 /* step through all local databases */
515 for (db
=0; db
<dbmap
->num
;db
++) {
519 for (i
=0;i
<remote_dbmap
->num
;i
++) {
520 if (dbmap
->dbs
[db
].dbid
== remote_dbmap
->dbs
[i
].dbid
) {
524 /* the remote node already have this database */
525 if (i
!=remote_dbmap
->num
) {
528 /* ok so we need to create this database */
529 ret
= ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), pnn
,
530 dbmap
->dbs
[db
].dbid
, mem_ctx
,
533 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbname from node %u\n", pnn
));
536 ret
= ctdb_ctrl_createdb(ctdb
, CONTROL_TIMEOUT(),
537 nodemap
->nodes
[j
].pnn
,
539 dbmap
->dbs
[db
].flags
& CTDB_DB_FLAGS_PERSISTENT
);
541 DEBUG(DEBUG_ERR
, (__location__
" Unable to create remote db:%s\n", name
));
552 ensure we are attached to any databases that anyone else is attached to
554 static int create_missing_local_databases(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
555 uint32_t pnn
, struct ctdb_dbid_map
**dbmap
, TALLOC_CTX
*mem_ctx
)
558 struct ctdb_dbid_map
*remote_dbmap
;
560 /* verify that we have all database any other node has */
561 for (j
=0; j
<nodemap
->num
; j
++) {
562 /* we dont need to ourself ourselves */
563 if (nodemap
->nodes
[j
].pnn
== pnn
) {
566 /* dont check nodes that are unavailable */
567 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
571 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
572 mem_ctx
, &remote_dbmap
);
574 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node %u\n", pnn
));
578 /* step through all databases on the remote node */
579 for (db
=0; db
<remote_dbmap
->num
;db
++) {
582 for (i
=0;i
<(*dbmap
)->num
;i
++) {
583 if (remote_dbmap
->dbs
[db
].dbid
== (*dbmap
)->dbs
[i
].dbid
) {
587 /* we already have this db locally */
588 if (i
!=(*dbmap
)->num
) {
591 /* ok so we need to create this database and
594 ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
595 remote_dbmap
->dbs
[db
].dbid
, mem_ctx
, &name
);
597 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbname from node %u\n",
598 nodemap
->nodes
[j
].pnn
));
601 ctdb_ctrl_createdb(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, name
,
602 remote_dbmap
->dbs
[db
].flags
& CTDB_DB_FLAGS_PERSISTENT
);
604 DEBUG(DEBUG_ERR
, (__location__
" Unable to create local db:%s\n", name
));
607 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, dbmap
);
609 DEBUG(DEBUG_ERR
, (__location__
" Unable to reread dbmap on node %u\n", pnn
));
620 pull the remote database contents from one node into the recdb
622 static int pull_one_remote_database(struct ctdb_context
*ctdb
, uint32_t srcnode
,
623 struct tdb_wrap
*recdb
, uint32_t dbid
)
627 struct ctdb_marshall_buffer
*reply
;
628 struct ctdb_rec_data
*rec
;
630 TALLOC_CTX
*tmp_ctx
= talloc_new(recdb
);
632 ret
= ctdb_ctrl_pulldb(ctdb
, srcnode
, dbid
, CTDB_LMASTER_ANY
, tmp_ctx
,
633 CONTROL_TIMEOUT(), &outdata
);
635 DEBUG(DEBUG_ERR
,(__location__
" Unable to copy db from node %u\n", srcnode
));
636 talloc_free(tmp_ctx
);
640 reply
= (struct ctdb_marshall_buffer
*)outdata
.dptr
;
642 if (outdata
.dsize
< offsetof(struct ctdb_marshall_buffer
, data
)) {
643 DEBUG(DEBUG_ERR
,(__location__
" invalid data in pulldb reply\n"));
644 talloc_free(tmp_ctx
);
648 rec
= (struct ctdb_rec_data
*)&reply
->data
[0];
652 rec
= (struct ctdb_rec_data
*)(rec
->length
+ (uint8_t *)rec
), i
++) {
654 struct ctdb_ltdb_header
*hdr
;
657 key
.dptr
= &rec
->data
[0];
658 key
.dsize
= rec
->keylen
;
659 data
.dptr
= &rec
->data
[key
.dsize
];
660 data
.dsize
= rec
->datalen
;
662 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
664 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
665 DEBUG(DEBUG_CRIT
,(__location__
" bad ltdb record\n"));
666 talloc_free(tmp_ctx
);
670 /* fetch the existing record, if any */
671 existing
= tdb_fetch(recdb
->tdb
, key
);
673 if (existing
.dptr
!= NULL
) {
674 struct ctdb_ltdb_header header
;
675 if (existing
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
676 DEBUG(DEBUG_CRIT
,(__location__
" Bad record size %u from node %u\n",
677 (unsigned)existing
.dsize
, srcnode
));
679 talloc_free(tmp_ctx
);
682 header
= *(struct ctdb_ltdb_header
*)existing
.dptr
;
684 if (!(header
.rsn
< hdr
->rsn
||
685 (header
.dmaster
!= ctdb
->recovery_master
&& header
.rsn
== hdr
->rsn
))) {
690 if (tdb_store(recdb
->tdb
, key
, data
, TDB_REPLACE
) != 0) {
691 DEBUG(DEBUG_CRIT
,(__location__
" Failed to store record\n"));
692 talloc_free(tmp_ctx
);
697 talloc_free(tmp_ctx
);
703 struct pull_seqnum_cbdata
{
709 static void pull_seqnum_cb(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
711 struct pull_seqnum_cbdata
*cb_data
= talloc_get_type(callback_data
, struct pull_seqnum_cbdata
);
714 if (cb_data
->failed
!= 0) {
715 DEBUG(DEBUG_ERR
, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn
));
720 DEBUG(DEBUG_ERR
, ("Error when pulling seqnum from node %d\n", node_pnn
));
725 if (outdata
.dsize
!= sizeof(uint64_t)) {
726 DEBUG(DEBUG_ERR
, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn
, (int)outdata
.dsize
, (int)sizeof(uint64_t)));
727 cb_data
->failed
= -1;
731 seqnum
= *((uint64_t *)outdata
.dptr
);
733 if (seqnum
> cb_data
->seqnum
||
734 (cb_data
->pnn
== -1 && seqnum
== 0)) {
735 cb_data
->seqnum
= seqnum
;
736 cb_data
->pnn
= node_pnn
;
740 static void pull_seqnum_fail_cb(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
742 struct pull_seqnum_cbdata
*cb_data
= talloc_get_type(callback_data
, struct pull_seqnum_cbdata
);
744 DEBUG(DEBUG_ERR
, ("Failed to pull db seqnum from node %d\n", node_pnn
));
748 static int pull_highest_seqnum_pdb(struct ctdb_context
*ctdb
,
749 struct ctdb_recoverd
*rec
,
750 struct ctdb_node_map
*nodemap
,
751 struct tdb_wrap
*recdb
, uint32_t dbid
)
753 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
757 struct pull_seqnum_cbdata
*cb_data
;
759 DEBUG(DEBUG_NOTICE
, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid
));
764 data
.dsize
= sizeof(outdata
);
765 data
.dptr
= (uint8_t *)&outdata
[0];
767 cb_data
= talloc(tmp_ctx
, struct pull_seqnum_cbdata
);
768 if (cb_data
== NULL
) {
769 DEBUG(DEBUG_ERR
, ("Failed to allocate pull highest seqnum cb_data structure\n"));
770 talloc_free(tmp_ctx
);
778 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
779 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_DB_SEQNUM
,
781 CONTROL_TIMEOUT(), false, data
,
785 DEBUG(DEBUG_ERR
, (__location__
" Failed to run async GET_DB_SEQNUM\n"));
787 talloc_free(tmp_ctx
);
791 if (cb_data
->failed
!= 0) {
792 DEBUG(DEBUG_NOTICE
, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid
));
793 talloc_free(tmp_ctx
);
797 if (cb_data
->pnn
== -1) {
798 DEBUG(DEBUG_NOTICE
, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid
));
799 talloc_free(tmp_ctx
);
803 DEBUG(DEBUG_NOTICE
, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid
, cb_data
->pnn
, (long long)cb_data
->seqnum
));
805 if (pull_one_remote_database(ctdb
, cb_data
->pnn
, recdb
, dbid
) != 0) {
806 DEBUG(DEBUG_ERR
, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid
, cb_data
->pnn
));
807 talloc_free(tmp_ctx
);
811 talloc_free(tmp_ctx
);
817 pull all the remote database contents into the recdb
819 static int pull_remote_database(struct ctdb_context
*ctdb
,
820 struct ctdb_recoverd
*rec
,
821 struct ctdb_node_map
*nodemap
,
822 struct tdb_wrap
*recdb
, uint32_t dbid
,
827 if (persistent
&& ctdb
->tunable
.recover_pdb_by_seqnum
!= 0) {
829 ret
= pull_highest_seqnum_pdb(ctdb
, rec
, nodemap
, recdb
, dbid
);
835 /* pull all records from all other nodes across onto this node
836 (this merges based on rsn)
838 for (j
=0; j
<nodemap
->num
; j
++) {
839 /* dont merge from nodes that are unavailable */
840 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
843 if (pull_one_remote_database(ctdb
, nodemap
->nodes
[j
].pnn
, recdb
, dbid
) != 0) {
844 DEBUG(DEBUG_ERR
,(__location__
" Failed to pull remote database from node %u\n",
845 nodemap
->nodes
[j
].pnn
));
846 ctdb_set_culprit_count(rec
, nodemap
->nodes
[j
].pnn
, nodemap
->num
);
856 update flags on all active nodes
858 static int update_flags_on_all_nodes(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
, uint32_t pnn
, uint32_t flags
)
862 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), pnn
, flags
, ~flags
);
864 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
872 ensure all nodes have the same vnnmap we do
874 static int update_vnnmap_on_all_nodes(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
,
875 uint32_t pnn
, struct ctdb_vnn_map
*vnnmap
, TALLOC_CTX
*mem_ctx
)
879 /* push the new vnn map out to all the nodes */
880 for (j
=0; j
<nodemap
->num
; j
++) {
881 /* dont push to nodes that are unavailable */
882 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
886 ret
= ctdb_ctrl_setvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, mem_ctx
, vnnmap
);
888 DEBUG(DEBUG_ERR
, (__location__
" Unable to set vnnmap for node %u\n", pnn
));
898 struct vacuum_info
*next
, *prev
;
899 struct ctdb_recoverd
*rec
;
901 struct ctdb_db_context
*ctdb_db
;
902 struct ctdb_marshall_buffer
*recs
;
903 struct ctdb_rec_data
*r
;
906 static void vacuum_fetch_next(struct vacuum_info
*v
);
909 called when a vacuum fetch has completed - just free it and do the next one
911 static void vacuum_fetch_callback(struct ctdb_client_call_state
*state
)
913 struct vacuum_info
*v
= talloc_get_type(state
->async
.private_data
, struct vacuum_info
);
915 vacuum_fetch_next(v
);
920 process the next element from the vacuum list
922 static void vacuum_fetch_next(struct vacuum_info
*v
)
924 struct ctdb_call call
;
925 struct ctdb_rec_data
*r
;
927 while (v
->recs
->count
) {
928 struct ctdb_client_call_state
*state
;
930 struct ctdb_ltdb_header
*hdr
;
933 call
.call_id
= CTDB_NULL_FUNC
;
934 call
.flags
= CTDB_IMMEDIATE_MIGRATION
;
935 call
.flags
|= CTDB_CALL_FLAG_VACUUM_MIGRATION
;
938 v
->r
= (struct ctdb_rec_data
*)(r
->length
+ (uint8_t *)r
);
941 call
.key
.dptr
= &r
->data
[0];
942 call
.key
.dsize
= r
->keylen
;
944 /* ensure we don't block this daemon - just skip a record if we can't get
946 if (tdb_chainlock_nonblock(v
->ctdb_db
->ltdb
->tdb
, call
.key
) != 0) {
950 data
= tdb_fetch(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
951 if (data
.dptr
== NULL
) {
952 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
956 if (data
.dsize
< sizeof(struct ctdb_ltdb_header
)) {
958 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
962 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
963 if (hdr
->dmaster
== v
->rec
->ctdb
->pnn
) {
964 /* its already local */
966 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
972 state
= ctdb_call_send(v
->ctdb_db
, &call
);
973 tdb_chainunlock(v
->ctdb_db
->ltdb
->tdb
, call
.key
);
975 DEBUG(DEBUG_ERR
,(__location__
" Failed to setup vacuum fetch call\n"));
979 state
->async
.fn
= vacuum_fetch_callback
;
980 state
->async
.private_data
= v
;
989 destroy a vacuum info structure
991 static int vacuum_info_destructor(struct vacuum_info
*v
)
993 DLIST_REMOVE(v
->rec
->vacuum_info
, v
);
999 handler for vacuum fetch
1001 static void vacuum_fetch_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
1002 TDB_DATA data
, void *private_data
)
1004 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
1005 struct ctdb_marshall_buffer
*recs
;
1007 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
1009 struct ctdb_dbid_map
*dbmap
=NULL
;
1010 bool persistent
= false;
1011 struct ctdb_db_context
*ctdb_db
;
1012 struct ctdb_rec_data
*r
;
1014 struct vacuum_info
*v
;
1016 recs
= (struct ctdb_marshall_buffer
*)data
.dptr
;
1017 r
= (struct ctdb_rec_data
*)&recs
->data
[0];
1019 if (recs
->count
== 0) {
1020 talloc_free(tmp_ctx
);
1026 for (v
=rec
->vacuum_info
;v
;v
=v
->next
) {
1027 if (srcnode
== v
->srcnode
&& recs
->db_id
== v
->ctdb_db
->db_id
) {
1028 /* we're already working on records from this node */
1029 talloc_free(tmp_ctx
);
1034 /* work out if the database is persistent */
1035 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &dbmap
);
1037 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from local node\n"));
1038 talloc_free(tmp_ctx
);
1042 for (i
=0;i
<dbmap
->num
;i
++) {
1043 if (dbmap
->dbs
[i
].dbid
== recs
->db_id
) {
1044 persistent
= dbmap
->dbs
[i
].flags
& CTDB_DB_FLAGS_PERSISTENT
;
1048 if (i
== dbmap
->num
) {
1049 DEBUG(DEBUG_ERR
, (__location__
" Unable to find db_id 0x%x on local node\n", recs
->db_id
));
1050 talloc_free(tmp_ctx
);
1054 /* find the name of this database */
1055 if (ctdb_ctrl_getdbname(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, recs
->db_id
, tmp_ctx
, &name
) != 0) {
1056 DEBUG(DEBUG_ERR
,(__location__
" Failed to get name of db 0x%x\n", recs
->db_id
));
1057 talloc_free(tmp_ctx
);
1062 ctdb_db
= ctdb_attach(ctdb
, CONTROL_TIMEOUT(), name
, persistent
, 0);
1063 if (ctdb_db
== NULL
) {
1064 DEBUG(DEBUG_ERR
,(__location__
" Failed to attach to database '%s'\n", name
));
1065 talloc_free(tmp_ctx
);
1069 v
= talloc_zero(rec
, struct vacuum_info
);
1071 DEBUG(DEBUG_CRIT
,(__location__
" Out of memory\n"));
1072 talloc_free(tmp_ctx
);
1077 v
->srcnode
= srcnode
;
1078 v
->ctdb_db
= ctdb_db
;
1079 v
->recs
= talloc_memdup(v
, recs
, data
.dsize
);
1080 if (v
->recs
== NULL
) {
1081 DEBUG(DEBUG_CRIT
,(__location__
" Out of memory\n"));
1083 talloc_free(tmp_ctx
);
1086 v
->r
= (struct ctdb_rec_data
*)&v
->recs
->data
[0];
1088 DLIST_ADD(rec
->vacuum_info
, v
);
1090 talloc_set_destructor(v
, vacuum_info_destructor
);
1092 vacuum_fetch_next(v
);
1093 talloc_free(tmp_ctx
);
1098 * handler for database detach
1100 static void detach_database_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
1101 TDB_DATA data
, void *private_data
)
1103 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
1104 struct ctdb_recoverd
);
1106 struct vacuum_info
*v
, *vnext
;
1107 struct ctdb_db_context
*ctdb_db
;
1109 if (data
.dsize
!= sizeof(db_id
)) {
1112 db_id
= *(uint32_t *)data
.dptr
;
1114 ctdb_db
= find_ctdb_db(ctdb
, db_id
);
1115 if (ctdb_db
== NULL
) {
1116 /* database is not attached */
1120 /* Stop any active vacuum fetch */
1121 v
= rec
->vacuum_info
;
1125 if (v
->ctdb_db
->db_id
== db_id
) {
1131 DLIST_REMOVE(ctdb
->db_list
, ctdb_db
);
1133 DEBUG(DEBUG_NOTICE
, ("Detached from database '%s'\n",
1135 talloc_free(ctdb_db
);
1139 called when ctdb_wait_timeout should finish
1141 static void ctdb_wait_handler(struct event_context
*ev
, struct timed_event
*te
,
1142 struct timeval yt
, void *p
)
1144 uint32_t *timed_out
= (uint32_t *)p
;
1149 wait for a given number of seconds
1151 static void ctdb_wait_timeout(struct ctdb_context
*ctdb
, double secs
)
1153 uint32_t timed_out
= 0;
1154 time_t usecs
= (secs
- (time_t)secs
) * 1000000;
1155 event_add_timed(ctdb
->ev
, ctdb
, timeval_current_ofs(secs
, usecs
), ctdb_wait_handler
, &timed_out
);
1156 while (!timed_out
) {
1157 event_loop_once(ctdb
->ev
);
1162 called when an election times out (ends)
1164 static void ctdb_election_timeout(struct event_context
*ev
, struct timed_event
*te
,
1165 struct timeval t
, void *p
)
1167 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
1168 rec
->election_timeout
= NULL
;
1171 DEBUG(DEBUG_WARNING
,("Election period ended\n"));
1176 wait for an election to finish. It finished election_timeout seconds after
1177 the last election packet is received
1179 static void ctdb_wait_election(struct ctdb_recoverd
*rec
)
1181 struct ctdb_context
*ctdb
= rec
->ctdb
;
1182 while (rec
->election_timeout
) {
1183 event_loop_once(ctdb
->ev
);
1188 Update our local flags from all remote connected nodes.
1189 This is only run when we are or we belive we are the recovery master
1191 static int update_local_flags(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
)
1194 struct ctdb_context
*ctdb
= rec
->ctdb
;
1195 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
1197 /* get the nodemap for all active remote nodes and verify
1198 they are the same as for this node
1200 for (j
=0; j
<nodemap
->num
; j
++) {
1201 struct ctdb_node_map
*remote_nodemap
=NULL
;
1204 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_DISCONNECTED
) {
1207 if (nodemap
->nodes
[j
].pnn
== ctdb
->pnn
) {
1211 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
1212 mem_ctx
, &remote_nodemap
);
1214 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from remote node %u\n",
1215 nodemap
->nodes
[j
].pnn
));
1216 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
1217 talloc_free(mem_ctx
);
1218 return MONITOR_FAILED
;
1220 if (nodemap
->nodes
[j
].flags
!= remote_nodemap
->nodes
[j
].flags
) {
1221 /* We should tell our daemon about this so it
1222 updates its flags or else we will log the same
1223 message again in the next iteration of recovery.
1224 Since we are the recovery master we can just as
1225 well update the flags on all nodes.
1227 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
, ~remote_nodemap
->nodes
[j
].flags
);
1229 DEBUG(DEBUG_ERR
, (__location__
" Unable to update nodeflags on remote nodes\n"));
1233 /* Update our local copy of the flags in the recovery
1236 DEBUG(DEBUG_NOTICE
,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1237 nodemap
->nodes
[j
].pnn
, remote_nodemap
->nodes
[j
].flags
,
1238 nodemap
->nodes
[j
].flags
));
1239 nodemap
->nodes
[j
].flags
= remote_nodemap
->nodes
[j
].flags
;
1241 talloc_free(remote_nodemap
);
1243 talloc_free(mem_ctx
);
1248 /* Create a new random generation ip.
1249 The generation id can not be the INVALID_GENERATION id
1251 static uint32_t new_generation(void)
1253 uint32_t generation
;
1256 generation
= random();
1258 if (generation
!= INVALID_GENERATION
) {
1268 create a temporary working database
1270 static struct tdb_wrap
*create_recdb(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
)
1273 struct tdb_wrap
*recdb
;
1276 /* open up the temporary recovery database */
1277 name
= talloc_asprintf(mem_ctx
, "%s/recdb.tdb.%u",
1278 ctdb
->db_directory_state
,
1285 tdb_flags
= TDB_NOLOCK
;
1286 if (ctdb
->valgrinding
) {
1287 tdb_flags
|= TDB_NOMMAP
;
1289 tdb_flags
|= (TDB_INCOMPATIBLE_HASH
| TDB_DISALLOW_NESTING
);
1291 recdb
= tdb_wrap_open(mem_ctx
, name
, ctdb
->tunable
.database_hash_size
,
1292 tdb_flags
, O_RDWR
|O_CREAT
|O_EXCL
, 0600);
1293 if (recdb
== NULL
) {
1294 DEBUG(DEBUG_CRIT
,(__location__
" Failed to create temp recovery database '%s'\n", name
));
/*
  a traverse function for pulling all relevant records from recdb
 */
/* Accumulator state shared with traverse_recdb(): the marshall buffer being
   built, its used/allocated sizes, and per-database flags.
   NOTE(review): the failed/persistent fields were dropped by extraction but
   are referenced by traverse_recdb below — verify against upstream. */
struct recdb_data {
	struct ctdb_context *ctdb;
	struct ctdb_marshall_buffer *recdata;  /* blob being assembled */
	uint32_t len;                          /* bytes used in recdata */
	uint32_t allocated_len;                /* bytes allocated for recdata */
	bool failed;                           /* set on any marshalling error */
	bool persistent;                       /* true for persistent databases */
};
1315 static int traverse_recdb(struct tdb_context
*tdb
, TDB_DATA key
, TDB_DATA data
, void *p
)
1317 struct recdb_data
*params
= (struct recdb_data
*)p
;
1318 struct ctdb_rec_data
*rec
;
1319 struct ctdb_ltdb_header
*hdr
;
1322 * skip empty records - but NOT for persistent databases:
1324 * The record-by-record mode of recovery deletes empty records.
1325 * For persistent databases, this can lead to data corruption
1326 * by deleting records that should be there:
1328 * - Assume the cluster has been running for a while.
1330 * - A record R in a persistent database has been created and
1331 * deleted a couple of times, the last operation being deletion,
1332 * leaving an empty record with a high RSN, say 10.
1334 * - Now a node N is turned off.
1336 * - This leaves the local database copy of D on N with the empty
1337 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1338 * the copy of record R.
1340 * - Now the record is created again while node N is turned off.
1341 * This creates R with RSN = 1 on all nodes except for N.
1343 * - Now node N is turned on again. The following recovery will chose
1344 * the older empty copy of R due to RSN 10 > RSN 1.
1346 * ==> Hence the record is gone after the recovery.
1348 * On databases like Samba's registry, this can damage the higher-level
1349 * data structures built from the various tdb-level records.
1351 if (!params
->persistent
&& data
.dsize
<= sizeof(struct ctdb_ltdb_header
)) {
1355 /* update the dmaster field to point to us */
1356 hdr
= (struct ctdb_ltdb_header
*)data
.dptr
;
1357 if (!params
->persistent
) {
1358 hdr
->dmaster
= params
->ctdb
->pnn
;
1359 hdr
->flags
|= CTDB_REC_FLAG_MIGRATED_WITH_DATA
;
1362 /* add the record to the blob ready to send to the nodes */
1363 rec
= ctdb_marshall_record(params
->recdata
, 0, key
, NULL
, data
);
1365 params
->failed
= true;
1368 if (params
->len
+ rec
->length
>= params
->allocated_len
) {
1369 params
->allocated_len
= rec
->length
+ params
->len
+ params
->ctdb
->tunable
.pulldb_preallocation_size
;
1370 params
->recdata
= talloc_realloc_size(NULL
, params
->recdata
, params
->allocated_len
);
1372 if (params
->recdata
== NULL
) {
1373 DEBUG(DEBUG_CRIT
,(__location__
" Failed to expand recdata to %u\n",
1374 rec
->length
+ params
->len
));
1375 params
->failed
= true;
1378 params
->recdata
->count
++;
1379 memcpy(params
->len
+(uint8_t *)params
->recdata
, rec
, rec
->length
);
1380 params
->len
+= rec
->length
;
1387 push the recdb database out to all nodes
1389 static int push_recdb_database(struct ctdb_context
*ctdb
, uint32_t dbid
,
1391 struct tdb_wrap
*recdb
, struct ctdb_node_map
*nodemap
)
1393 struct recdb_data params
;
1394 struct ctdb_marshall_buffer
*recdata
;
1396 TALLOC_CTX
*tmp_ctx
;
1399 tmp_ctx
= talloc_new(ctdb
);
1400 CTDB_NO_MEMORY(ctdb
, tmp_ctx
);
1402 recdata
= talloc_zero(recdb
, struct ctdb_marshall_buffer
);
1403 CTDB_NO_MEMORY(ctdb
, recdata
);
1405 recdata
->db_id
= dbid
;
1408 params
.recdata
= recdata
;
1409 params
.len
= offsetof(struct ctdb_marshall_buffer
, data
);
1410 params
.allocated_len
= params
.len
;
1411 params
.failed
= false;
1412 params
.persistent
= persistent
;
1414 if (tdb_traverse_read(recdb
->tdb
, traverse_recdb
, ¶ms
) == -1) {
1415 DEBUG(DEBUG_ERR
,(__location__
" Failed to traverse recdb database\n"));
1416 talloc_free(params
.recdata
);
1417 talloc_free(tmp_ctx
);
1421 if (params
.failed
) {
1422 DEBUG(DEBUG_ERR
,(__location__
" Failed to traverse recdb database\n"));
1423 talloc_free(params
.recdata
);
1424 talloc_free(tmp_ctx
);
1428 recdata
= params
.recdata
;
1430 outdata
.dptr
= (void *)recdata
;
1431 outdata
.dsize
= params
.len
;
1433 nodes
= list_of_active_nodes(ctdb
, nodemap
, tmp_ctx
, true);
1434 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_PUSH_DB
,
1436 CONTROL_TIMEOUT(), false, outdata
,
1439 DEBUG(DEBUG_ERR
,(__location__
" Failed to push recdb records to nodes for db 0x%x\n", dbid
));
1440 talloc_free(recdata
);
1441 talloc_free(tmp_ctx
);
1445 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - pushed remote database 0x%x of size %u\n",
1446 dbid
, recdata
->count
));
1448 talloc_free(recdata
);
1449 talloc_free(tmp_ctx
);
1456 go through a full recovery on one database
1458 static int recover_database(struct ctdb_recoverd
*rec
,
1459 TALLOC_CTX
*mem_ctx
,
1463 struct ctdb_node_map
*nodemap
,
1464 uint32_t transaction_id
)
1466 struct tdb_wrap
*recdb
;
1468 struct ctdb_context
*ctdb
= rec
->ctdb
;
1470 struct ctdb_control_wipe_database w
;
1473 recdb
= create_recdb(ctdb
, mem_ctx
);
1474 if (recdb
== NULL
) {
1478 /* pull all remote databases onto the recdb */
1479 ret
= pull_remote_database(ctdb
, rec
, nodemap
, recdb
, dbid
, persistent
);
1481 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull remote database 0x%x\n", dbid
));
1485 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - pulled remote database 0x%x\n", dbid
));
1487 /* wipe all the remote databases. This is safe as we are in a transaction */
1489 w
.transaction_id
= transaction_id
;
1491 data
.dptr
= (void *)&w
;
1492 data
.dsize
= sizeof(w
);
1494 nodes
= list_of_active_nodes(ctdb
, nodemap
, recdb
, true);
1495 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_WIPE_DATABASE
,
1497 CONTROL_TIMEOUT(), false, data
,
1500 DEBUG(DEBUG_ERR
, (__location__
" Unable to wipe database. Recovery failed.\n"));
1505 /* push out the correct database. This sets the dmaster and skips
1506 the empty records */
1507 ret
= push_recdb_database(ctdb
, dbid
, persistent
, recdb
, nodemap
);
1513 /* all done with this database */
1519 static int ctdb_reload_remote_public_ips(struct ctdb_context
*ctdb
,
1520 struct ctdb_recoverd
*rec
,
1521 struct ctdb_node_map
*nodemap
,
1527 if (ctdb
->num_nodes
!= nodemap
->num
) {
1528 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1529 ctdb
->num_nodes
, nodemap
->num
));
1531 *culprit
= ctdb
->pnn
;
1536 for (j
=0; j
<nodemap
->num
; j
++) {
1537 /* For readability */
1538 struct ctdb_node
*node
= ctdb
->nodes
[j
];
1540 /* release any existing data */
1541 if (node
->known_public_ips
) {
1542 talloc_free(node
->known_public_ips
);
1543 node
->known_public_ips
= NULL
;
1545 if (node
->available_public_ips
) {
1546 talloc_free(node
->available_public_ips
);
1547 node
->available_public_ips
= NULL
;
1550 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
1554 /* Retrieve the list of known public IPs from the node */
1555 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
,
1560 &node
->known_public_ips
);
1563 ("Failed to read known public IPs from node: %u\n",
1566 *culprit
= node
->pnn
;
1571 if (ctdb
->do_checkpublicip
&&
1572 rec
->takeover_runs_disable_ctx
== NULL
&&
1573 verify_remote_ip_allocation(ctdb
,
1574 node
->known_public_ips
,
1576 DEBUG(DEBUG_ERR
,("Trigger IP reallocation\n"));
1577 rec
->need_takeover_run
= true;
1580 /* Retrieve the list of available public IPs from the node */
1581 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
,
1585 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
,
1586 &node
->available_public_ips
);
1589 ("Failed to read available public IPs from node: %u\n",
1592 *culprit
= node
->pnn
;
1601 /* when we start a recovery, make sure all nodes use the same reclock file
1604 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd
*rec
)
1606 struct ctdb_context
*ctdb
= rec
->ctdb
;
1607 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
1611 if (ctdb
->recovery_lock_file
== NULL
) {
1615 data
.dsize
= strlen(ctdb
->recovery_lock_file
) + 1;
1616 data
.dptr
= (uint8_t *)ctdb
->recovery_lock_file
;
1619 nodes
= list_of_active_nodes(ctdb
, rec
->nodemap
, tmp_ctx
, true);
1620 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_SET_RECLOCK_FILE
,
1626 DEBUG(DEBUG_ERR
, (__location__
" Failed to sync reclock file settings\n"));
1627 talloc_free(tmp_ctx
);
1631 talloc_free(tmp_ctx
);
1637 * this callback is called for every node that failed to execute ctdb_takeover_run()
1638 * and set flag to re-run takeover run.
1640 static void takeover_fail_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
1642 DEBUG(DEBUG_ERR
, ("Node %u failed the takeover run\n", node_pnn
));
1644 if (callback_data
!= NULL
) {
1645 struct ctdb_recoverd
*rec
= talloc_get_type(callback_data
, struct ctdb_recoverd
);
1647 DEBUG(DEBUG_ERR
, ("Setting node %u as recovery fail culprit\n", node_pnn
));
1649 ctdb_set_culprit(rec
, node_pnn
);
1654 static void ban_misbehaving_nodes(struct ctdb_recoverd
*rec
, bool *self_ban
)
1656 struct ctdb_context
*ctdb
= rec
->ctdb
;
1658 struct ctdb_banning_state
*ban_state
;
1661 for (i
=0; i
<ctdb
->num_nodes
; i
++) {
1662 if (ctdb
->nodes
[i
]->ban_state
== NULL
) {
1665 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[i
]->ban_state
;
1666 if (ban_state
->count
< 2*ctdb
->num_nodes
) {
1670 DEBUG(DEBUG_NOTICE
,("Node %u reached %u banning credits - banning it for %u seconds\n",
1671 ctdb
->nodes
[i
]->pnn
, ban_state
->count
,
1672 ctdb
->tunable
.recovery_ban_period
));
1673 ctdb_ban_node(rec
, ctdb
->nodes
[i
]->pnn
, ctdb
->tunable
.recovery_ban_period
);
1674 ban_state
->count
= 0;
1676 /* Banning ourself? */
1677 if (ctdb
->nodes
[i
]->pnn
== rec
->ctdb
->pnn
) {
1683 static bool do_takeover_run(struct ctdb_recoverd
*rec
,
1684 struct ctdb_node_map
*nodemap
,
1685 bool banning_credits_on_fail
)
1687 uint32_t *nodes
= NULL
;
1688 struct srvid_request_data dtr
;
1691 uint32_t *rebalance_nodes
= rec
->force_rebalance_nodes
;
1695 DEBUG(DEBUG_NOTICE
, ("Takeover run starting\n"));
1697 if (rec
->takeover_run_in_progress
) {
1698 DEBUG(DEBUG_ERR
, (__location__
1699 " takeover run already in progress \n"));
1704 rec
->takeover_run_in_progress
= true;
1706 /* If takeover runs are in disabled then fail... */
1707 if (rec
->takeover_runs_disable_ctx
!= NULL
) {
1709 ("Takeover runs are disabled so refusing to run one\n"));
1714 /* Disable IP checks (takeover runs, really) on other nodes
1715 * while doing this takeover run. This will stop those other
1716 * nodes from triggering takeover runs when think they should
1717 * be hosting an IP but it isn't yet on an interface. Don't
1718 * wait for replies since a failure here might cause some
1719 * noise in the logs but will not actually cause a problem.
1721 dtr
.srvid
= 0; /* No reply */
1724 data
.dptr
= (uint8_t*)&dtr
;
1725 data
.dsize
= sizeof(dtr
);
1727 nodes
= list_of_connected_nodes(rec
->ctdb
, nodemap
, rec
, false);
1729 /* Disable for 60 seconds. This can be a tunable later if
1733 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1734 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1735 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1737 DEBUG(DEBUG_INFO
,("Failed to disable takeover runs\n"));
1741 ret
= ctdb_takeover_run(rec
->ctdb
, nodemap
,
1742 rec
->force_rebalance_nodes
,
1743 takeover_fail_callback
,
1744 banning_credits_on_fail
? rec
: NULL
);
1746 /* Reenable takeover runs and IP checks on other nodes */
1748 for (i
= 0; i
< talloc_array_length(nodes
); i
++) {
1749 if (ctdb_client_send_message(rec
->ctdb
, nodes
[i
],
1750 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
1752 DEBUG(DEBUG_INFO
,("Failed to reenable takeover runs\n"));
1757 DEBUG(DEBUG_ERR
, ("ctdb_takeover_run() failed\n"));
1763 /* Takeover run was successful so clear force rebalance targets */
1764 if (rebalance_nodes
== rec
->force_rebalance_nodes
) {
1765 TALLOC_FREE(rec
->force_rebalance_nodes
);
1767 DEBUG(DEBUG_WARNING
,
1768 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1771 rec
->need_takeover_run
= !ok
;
1773 rec
->takeover_run_in_progress
= false;
1775 DEBUG(DEBUG_NOTICE
, ("Takeover run %s\n", ok
? "completed successfully" : "unsuccessful"));
1781 we are the recmaster, and recovery is needed - start a recovery run
1783 static int do_recovery(struct ctdb_recoverd
*rec
,
1784 TALLOC_CTX
*mem_ctx
, uint32_t pnn
,
1785 struct ctdb_node_map
*nodemap
, struct ctdb_vnn_map
*vnnmap
)
1787 struct ctdb_context
*ctdb
= rec
->ctdb
;
1789 uint32_t generation
;
1790 struct ctdb_dbid_map
*dbmap
;
1793 struct timeval start_time
;
1794 uint32_t culprit
= (uint32_t)-1;
1797 DEBUG(DEBUG_NOTICE
, (__location__
" Starting do_recovery\n"));
1799 /* if recovery fails, force it again */
1800 rec
->need_recovery
= true;
1802 if (rec
->election_timeout
) {
1803 /* an election is in progress */
1804 DEBUG(DEBUG_ERR
, ("do_recovery called while election in progress - try again later\n"));
1808 ban_misbehaving_nodes(rec
, &self_ban
);
1810 DEBUG(DEBUG_NOTICE
, ("This node was banned, aborting recovery\n"));
1814 if (ctdb
->tunable
.verify_recovery_lock
!= 0) {
1815 DEBUG(DEBUG_ERR
,("Taking out recovery lock from recovery daemon\n"));
1816 start_time
= timeval_current();
1817 if (!ctdb_recovery_lock(ctdb
, true)) {
1818 DEBUG(DEBUG_ERR
,("Unable to get recovery lock - aborting recovery "
1819 "and ban ourself for %u seconds\n",
1820 ctdb
->tunable
.recovery_ban_period
));
1821 ctdb_ban_node(rec
, pnn
, ctdb
->tunable
.recovery_ban_period
);
1824 ctdb_ctrl_report_recd_lock_latency(ctdb
, CONTROL_TIMEOUT(), timeval_elapsed(&start_time
));
1825 DEBUG(DEBUG_NOTICE
,("Recovery lock taken successfully by recovery daemon\n"));
1828 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery initiated due to problem with node %u\n", rec
->last_culprit_node
));
1830 /* get a list of all databases */
1831 ret
= ctdb_ctrl_getdbmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &dbmap
);
1833 DEBUG(DEBUG_ERR
, (__location__
" Unable to get dbids from node :%u\n", pnn
));
1837 /* we do the db creation before we set the recovery mode, so the freeze happens
1838 on all databases we will be dealing with. */
1840 /* verify that we have all the databases any other node has */
1841 ret
= create_missing_local_databases(ctdb
, nodemap
, pnn
, &dbmap
, mem_ctx
);
1843 DEBUG(DEBUG_ERR
, (__location__
" Unable to create missing local databases\n"));
1847 /* verify that all other nodes have all our databases */
1848 ret
= create_missing_remote_databases(ctdb
, nodemap
, pnn
, dbmap
, mem_ctx
);
1850 DEBUG(DEBUG_ERR
, (__location__
" Unable to create missing remote databases\n"));
1853 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - created remote databases\n"));
1855 /* update the database priority for all remote databases */
1856 ret
= update_db_priority_on_remote_nodes(ctdb
, nodemap
, pnn
, dbmap
, mem_ctx
);
1858 DEBUG(DEBUG_ERR
, (__location__
" Unable to set db priority on remote nodes\n"));
1860 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated db priority for all databases\n"));
1863 /* update all other nodes to use the same setting for reclock files
1864 as the local recovery master.
1866 sync_recovery_lock_file_across_cluster(rec
);
1868 /* set recovery mode to active on all nodes */
1869 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
);
1871 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
1875 /* execute the "startrecovery" event script on all nodes */
1876 ret
= run_startrecovery_eventscript(rec
, nodemap
);
1878 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event on cluster\n"));
1883 update all nodes to have the same flags that we have
1885 for (i
=0;i
<nodemap
->num
;i
++) {
1886 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
1890 ret
= update_flags_on_all_nodes(ctdb
, nodemap
, i
, nodemap
->nodes
[i
].flags
);
1892 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
1893 DEBUG(DEBUG_WARNING
, (__location__
"Unable to update flags on inactive node %d\n", i
));
1895 DEBUG(DEBUG_ERR
, (__location__
" Unable to update flags on all nodes for node %d\n", i
));
1901 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated flags\n"));
1903 /* pick a new generation number */
1904 generation
= new_generation();
1906 /* change the vnnmap on this node to use the new generation
1907 number but not on any other nodes.
1908 this guarantees that if we abort the recovery prematurely
1909 for some reason (a node stops responding?)
1910 that we can just return immediately and we will reenter
1911 recovery shortly again.
1912 I.e. we deliberately leave the cluster with an inconsistent
1913 generation id to allow us to abort recovery at any stage and
1914 just restart it from scratch.
1916 vnnmap
->generation
= generation
;
1917 ret
= ctdb_ctrl_setvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, vnnmap
);
1919 DEBUG(DEBUG_ERR
, (__location__
" Unable to set vnnmap for node %u\n", pnn
));
1923 data
.dptr
= (void *)&generation
;
1924 data
.dsize
= sizeof(uint32_t);
1926 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
1927 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_START
,
1929 CONTROL_TIMEOUT(), false, data
,
1931 transaction_start_fail_callback
,
1933 DEBUG(DEBUG_ERR
, (__location__
" Unable to start transactions. Recovery failed.\n"));
1934 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_CANCEL
,
1936 CONTROL_TIMEOUT(), false, tdb_null
,
1940 DEBUG(DEBUG_ERR
,("Failed to cancel recovery transaction\n"));
1945 DEBUG(DEBUG_NOTICE
,(__location__
" started transactions on all nodes\n"));
1947 for (i
=0;i
<dbmap
->num
;i
++) {
1948 ret
= recover_database(rec
, mem_ctx
,
1950 dbmap
->dbs
[i
].flags
& CTDB_DB_FLAGS_PERSISTENT
,
1951 pnn
, nodemap
, generation
);
1953 DEBUG(DEBUG_ERR
, (__location__
" Failed to recover database 0x%x\n", dbmap
->dbs
[i
].dbid
));
1958 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - starting database commits\n"));
1960 /* commit all the changes */
1961 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_TRANSACTION_COMMIT
,
1963 CONTROL_TIMEOUT(), false, data
,
1966 DEBUG(DEBUG_ERR
, (__location__
" Unable to commit recovery changes. Recovery failed.\n"));
1970 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - committed databases\n"));
1973 /* update the capabilities for all nodes */
1974 ret
= update_capabilities(ctdb
, nodemap
);
1976 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
1980 /* build a new vnn map with all the currently active and
1982 generation
= new_generation();
1983 vnnmap
= talloc(mem_ctx
, struct ctdb_vnn_map
);
1984 CTDB_NO_MEMORY(ctdb
, vnnmap
);
1985 vnnmap
->generation
= generation
;
1987 vnnmap
->map
= talloc_zero_array(vnnmap
, uint32_t, vnnmap
->size
);
1988 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
1989 for (i
=j
=0;i
<nodemap
->num
;i
++) {
1990 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
) {
1993 if (!(ctdb
->nodes
[i
]->capabilities
& CTDB_CAP_LMASTER
)) {
1994 /* this node can not be an lmaster */
1995 DEBUG(DEBUG_DEBUG
, ("Node %d cant be a LMASTER, skipping it\n", i
));
2000 vnnmap
->map
= talloc_realloc(vnnmap
, vnnmap
->map
, uint32_t, vnnmap
->size
);
2001 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2002 vnnmap
->map
[j
++] = nodemap
->nodes
[i
].pnn
;
2005 if (vnnmap
->size
== 0) {
2006 DEBUG(DEBUG_NOTICE
, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2008 vnnmap
->map
= talloc_realloc(vnnmap
, vnnmap
->map
, uint32_t, vnnmap
->size
);
2009 CTDB_NO_MEMORY(ctdb
, vnnmap
->map
);
2010 vnnmap
->map
[0] = pnn
;
2013 /* update to the new vnnmap on all nodes */
2014 ret
= update_vnnmap_on_all_nodes(ctdb
, nodemap
, pnn
, vnnmap
, mem_ctx
);
2016 DEBUG(DEBUG_ERR
, (__location__
" Unable to update vnnmap on all nodes\n"));
2020 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated vnnmap\n"));
2022 /* update recmaster to point to us for all nodes */
2023 ret
= set_recovery_master(ctdb
, nodemap
, pnn
);
2025 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery master\n"));
2029 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - updated recmaster\n"));
2031 /* disable recovery mode */
2032 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_NORMAL
);
2034 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to normal on cluster\n"));
2038 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - disabled recovery mode\n"));
2040 /* Fetch known/available public IPs from each active node */
2041 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, nodemap
, &culprit
);
2043 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
2045 rec
->need_takeover_run
= true;
2049 do_takeover_run(rec
, nodemap
, false);
2051 /* execute the "recovered" event script on all nodes */
2052 ret
= run_recovered_eventscript(rec
, nodemap
, "do_recovery");
2054 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2058 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery - finished the recovered event\n"));
2060 /* send a message to all clients telling them that the cluster
2061 has been reconfigured */
2062 ret
= ctdb_client_send_message(ctdb
, CTDB_BROADCAST_CONNECTED
,
2063 CTDB_SRVID_RECONFIGURE
, tdb_null
);
2065 DEBUG(DEBUG_ERR
, (__location__
" Failed to send reconfigure message\n"));
2069 DEBUG(DEBUG_NOTICE
, (__location__
" Recovery complete\n"));
2071 rec
->need_recovery
= false;
2073 /* we managed to complete a full recovery, make sure to forgive
2074 any past sins by the nodes that could now participate in the
2077 DEBUG(DEBUG_ERR
,("Resetting ban count to 0 for all nodes\n"));
2078 for (i
=0;i
<nodemap
->num
;i
++) {
2079 struct ctdb_banning_state
*ban_state
;
2081 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
2085 ban_state
= (struct ctdb_banning_state
*)ctdb
->nodes
[nodemap
->nodes
[i
].pnn
]->ban_state
;
2086 if (ban_state
== NULL
) {
2090 ban_state
->count
= 0;
2094 /* We just finished a recovery successfully.
2095 We now wait for rerecovery_timeout before we allow
2096 another recovery to take place.
2098 DEBUG(DEBUG_NOTICE
, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb
->tunable
.rerecovery_timeout
));
2099 ctdb_wait_timeout(ctdb
, ctdb
->tunable
.rerecovery_timeout
);
2100 DEBUG(DEBUG_NOTICE
, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
/*
  elections are won by first checking the number of connected nodes, then
  the priority time, then the pnn.
  NOTE(review): the pnn field was dropped by extraction but is referenced by
  ctdb_election_win/ctdb_election_data — verify against upstream.
 */
struct election_message {
	uint32_t num_connected;
	struct timeval priority_time;
	uint32_t pnn;
	uint32_t node_flags;
};
2118 form this nodes election data
2120 static void ctdb_election_data(struct ctdb_recoverd
*rec
, struct election_message
*em
)
2123 struct ctdb_node_map
*nodemap
;
2124 struct ctdb_context
*ctdb
= rec
->ctdb
;
2128 em
->pnn
= rec
->ctdb
->pnn
;
2129 em
->priority_time
= rec
->priority_time
;
2131 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, rec
, &nodemap
);
2133 DEBUG(DEBUG_ERR
,(__location__
" unable to get node map\n"));
2137 rec
->node_flags
= nodemap
->nodes
[ctdb
->pnn
].flags
;
2138 em
->node_flags
= rec
->node_flags
;
2140 for (i
=0;i
<nodemap
->num
;i
++) {
2141 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
)) {
2142 em
->num_connected
++;
2146 /* we shouldnt try to win this election if we cant be a recmaster */
2147 if ((ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
2148 em
->num_connected
= 0;
2149 em
->priority_time
= timeval_current();
2152 talloc_free(nodemap
);
2156 see if the given election data wins
2158 static bool ctdb_election_win(struct ctdb_recoverd
*rec
, struct election_message
*em
)
2160 struct election_message myem
;
2163 ctdb_election_data(rec
, &myem
);
2165 /* we cant win if we dont have the recmaster capability */
2166 if ((rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) == 0) {
2170 /* we cant win if we are banned */
2171 if (rec
->node_flags
& NODE_FLAGS_BANNED
) {
2175 /* we cant win if we are stopped */
2176 if (rec
->node_flags
& NODE_FLAGS_STOPPED
) {
2180 /* we will automatically win if the other node is banned */
2181 if (em
->node_flags
& NODE_FLAGS_BANNED
) {
2185 /* we will automatically win if the other node is banned */
2186 if (em
->node_flags
& NODE_FLAGS_STOPPED
) {
2190 /* try to use the most connected node */
2192 cmp
= (int)myem
.num_connected
- (int)em
->num_connected
;
2195 /* then the longest running node */
2197 cmp
= timeval_compare(&em
->priority_time
, &myem
.priority_time
);
2201 cmp
= (int)myem
.pnn
- (int)em
->pnn
;
2208 send out an election request
2210 static int send_election_request(struct ctdb_recoverd
*rec
, uint32_t pnn
)
2213 TDB_DATA election_data
;
2214 struct election_message emsg
;
2216 struct ctdb_context
*ctdb
= rec
->ctdb
;
2218 srvid
= CTDB_SRVID_RECOVERY
;
2220 ctdb_election_data(rec
, &emsg
);
2222 election_data
.dsize
= sizeof(struct election_message
);
2223 election_data
.dptr
= (unsigned char *)&emsg
;
2226 /* first we assume we will win the election and set
2227 recoverymaster to be ourself on the current node
2229 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(), pnn
, pnn
);
2231 DEBUG(DEBUG_ERR
, (__location__
" failed to send recmaster election request\n"));
2236 /* send an election message to all active nodes */
2237 DEBUG(DEBUG_INFO
,(__location__
" Send election request to all active nodes\n"));
2238 return ctdb_client_send_message(ctdb
, CTDB_BROADCAST_ALL
, srvid
, election_data
);
2242 this function will unban all nodes in the cluster
2244 static void unban_all_nodes(struct ctdb_context
*ctdb
)
2247 struct ctdb_node_map
*nodemap
;
2248 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2250 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &nodemap
);
2252 DEBUG(DEBUG_ERR
,(__location__
" failed to get nodemap to unban all nodes\n"));
2256 for (i
=0;i
<nodemap
->num
;i
++) {
2257 if ( (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
))
2258 && (nodemap
->nodes
[i
].flags
& NODE_FLAGS_BANNED
) ) {
2259 ret
= ctdb_ctrl_modflags(ctdb
, CONTROL_TIMEOUT(),
2260 nodemap
->nodes
[i
].pnn
, 0,
2263 DEBUG(DEBUG_ERR
, (__location__
" failed to reset ban state\n"));
2268 talloc_free(tmp_ctx
);
2273 we think we are winning the election - send a broadcast election request
2275 static void election_send_request(struct event_context
*ev
, struct timed_event
*te
, struct timeval t
, void *p
)
2277 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2280 ret
= send_election_request(rec
, ctdb_get_pnn(rec
->ctdb
));
2282 DEBUG(DEBUG_ERR
,("Failed to send election request!\n"));
2285 talloc_free(rec
->send_election_te
);
2286 rec
->send_election_te
= NULL
;
2290 handler for memory dumps
2292 static void mem_dump_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2293 TDB_DATA data
, void *private_data
)
2295 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2298 struct srvid_request
*rd
;
2300 if (data
.dsize
!= sizeof(struct srvid_request
)) {
2301 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2302 talloc_free(tmp_ctx
);
2305 rd
= (struct srvid_request
*)data
.dptr
;
2307 dump
= talloc_zero(tmp_ctx
, TDB_DATA
);
2309 DEBUG(DEBUG_ERR
, (__location__
" Failed to allocate memory for memdump\n"));
2310 talloc_free(tmp_ctx
);
2313 ret
= ctdb_dump_memory(ctdb
, dump
);
2315 DEBUG(DEBUG_ERR
, (__location__
" ctdb_dump_memory() failed\n"));
2316 talloc_free(tmp_ctx
);
2320 DEBUG(DEBUG_ERR
, ("recovery master memory dump\n"));
2322 ret
= ctdb_client_send_message(ctdb
, rd
->pnn
, rd
->srvid
, *dump
);
2324 DEBUG(DEBUG_ERR
,("Failed to send rd memdump reply message\n"));
2325 talloc_free(tmp_ctx
);
2329 talloc_free(tmp_ctx
);
2335 static void getlog_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2336 TDB_DATA data
, void *private_data
)
2338 struct ctdb_get_log_addr
*log_addr
;
2341 if (data
.dsize
!= sizeof(struct ctdb_get_log_addr
)) {
2342 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2345 log_addr
= (struct ctdb_get_log_addr
*)data
.dptr
;
2347 child
= ctdb_fork_no_free_ringbuffer(ctdb
);
2348 if (child
== (pid_t
)-1) {
2349 DEBUG(DEBUG_ERR
,("Failed to fork a log collector child\n"));
2354 ctdb_set_process_name("ctdb_rec_log_collector");
2355 if (switch_from_server_to_client(ctdb
, "recoverd-log-collector") != 0) {
2356 DEBUG(DEBUG_CRIT
, (__location__
"ERROR: failed to switch log collector child into client mode.\n"));
2359 ctdb_collect_log(ctdb
, log_addr
);
2365 handler for clearlog
2367 static void clearlog_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2368 TDB_DATA data
, void *private_data
)
2370 ctdb_clear_log(ctdb
);
2374 handler for reload_nodes
2376 static void reload_nodes_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2377 TDB_DATA data
, void *private_data
)
2379 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2381 DEBUG(DEBUG_ERR
, (__location__
" Reload nodes file from recovery daemon\n"));
2383 ctdb_load_nodes_file(rec
->ctdb
);
2387 static void ctdb_rebalance_timeout(struct event_context
*ev
,
2388 struct timed_event
*te
,
2389 struct timeval t
, void *p
)
2391 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2393 if (rec
->force_rebalance_nodes
== NULL
) {
2395 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2400 ("Rebalance timeout occurred - do takeover run\n"));
2401 do_takeover_run(rec
, rec
->nodemap
, false);
2405 static void recd_node_rebalance_handler(struct ctdb_context
*ctdb
,
2407 TDB_DATA data
, void *private_data
)
2412 uint32_t deferred_rebalance
;
2413 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2415 if (rec
->recmaster
!= ctdb_get_pnn(ctdb
)) {
2419 if (data
.dsize
!= sizeof(uint32_t)) {
2420 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(uint32_t)));
2424 pnn
= *(uint32_t *)&data
.dptr
[0];
2426 DEBUG(DEBUG_NOTICE
,("Setting up rebalance of IPs to node %u\n", pnn
));
2428 /* Copy any existing list of nodes. There's probably some
2429 * sort of realloc variant that will do this but we need to
2430 * make sure that freeing the old array also cancels the timer
2431 * event for the timeout... not sure if realloc will do that.
2433 len
= (rec
->force_rebalance_nodes
!= NULL
) ?
2434 talloc_array_length(rec
->force_rebalance_nodes
) :
2437 /* This allows duplicates to be added but they don't cause
2438 * harm. A call to add a duplicate PNN arguably means that
2439 * the timeout should be reset, so this is the simplest
2442 t
= talloc_zero_array(rec
, uint32_t, len
+1);
2443 CTDB_NO_MEMORY_VOID(ctdb
, t
);
2445 memcpy(t
, rec
->force_rebalance_nodes
, sizeof(uint32_t) * len
);
2449 talloc_free(rec
->force_rebalance_nodes
);
2451 rec
->force_rebalance_nodes
= t
;
2453 /* If configured, setup a deferred takeover run to make sure
2454 * that certain nodes get IPs rebalanced to them. This will
2455 * be cancelled if a successful takeover run happens before
2456 * the timeout. Assign tunable value to variable for
2459 deferred_rebalance
= ctdb
->tunable
.deferred_rebalance_on_node_add
;
2460 if (deferred_rebalance
!= 0) {
2461 event_add_timed(ctdb
->ev
, rec
->force_rebalance_nodes
,
2462 timeval_current_ofs(deferred_rebalance
, 0),
2463 ctdb_rebalance_timeout
, rec
);
/* SRVID message handler: fold a public-IP update into the recovery
 * daemon's IP-assignment tree via update_ip_assignment_tree().
 * Only the recovery master acts on it (other nodes log at INFO and
 * ignore); the payload must be exactly one struct ctdb_public_ip.
 * NOTE(review): this extract is missing some original lines (braces /
 * early returns), so the visible control flow is incomplete. */
2469 static void recd_update_ip_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2470 TDB_DATA data
, void *private_data
)
2472 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2473 struct ctdb_public_ip
*ip
;
2475 if (rec
->recmaster
!= rec
->ctdb
->pnn
) {
2476 DEBUG(DEBUG_INFO
,("Not recmaster, ignore update ip message\n"));
2480 if (data
.dsize
!= sizeof(struct ctdb_public_ip
)) {
2481 DEBUG(DEBUG_ERR
,(__location__
" Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data
.dsize
, sizeof(struct ctdb_public_ip
)));
2485 ip
= (struct ctdb_public_ip
*)data
.dptr
;
2487 update_ip_assignment_tree(rec
->ctdb
, ip
);
2491 static void clear_takeover_runs_disable(struct ctdb_recoverd
*rec
)
2493 TALLOC_FREE(rec
->takeover_runs_disable_ctx
);
/* Timer callback: fires when the takeover-runs-disable timeout
 * expires and re-enables takeover runs by clearing the disable
 * context.  Registered by disable_takeover_runs_handler(). */
2496 static void reenable_takeover_runs(struct event_context
*ev
,
2497 struct timed_event
*te
,
2498 struct timeval yt
, void *p
)
2500 struct ctdb_recoverd
*rec
= talloc_get_type(p
, struct ctdb_recoverd
);
2502 DEBUG(DEBUG_NOTICE
,("Reenabling takeover runs after timeout\n"));
2503 clear_takeover_runs_disable(rec
);
/* SRVID handler: disable takeover runs for a requested number of
 * seconds (timeout 0 re-enables immediately).  Validates the payload
 * as a struct srvid_request_data, allocates a fresh
 * takeover_runs_disable_ctx holding the timeout timer, and replies to
 * the sender with this node's PNN on success via srvid_request_reply().
 * NOTE(review): extract is missing some original lines, so error-path
 * returns and the reply-on-failure value are not all visible here. */
2506 static void disable_takeover_runs_handler(struct ctdb_context
*ctdb
,
2507 uint64_t srvid
, TDB_DATA data
,
2510 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2511 struct ctdb_recoverd
);
2512 struct srvid_request_data
*r
;
2517 /* Validate input data */
/* NOTE(review): the check compares against sizeof(struct
 * srvid_request_data) but the error message prints
 * sizeof(struct srvid_request) — looks like a copy/paste slip. */
2518 if (data
.dsize
!= sizeof(struct srvid_request_data
)) {
2519 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
2520 "expecting %lu\n", (long unsigned)data
.dsize
,
2521 (long unsigned)sizeof(struct srvid_request
)));
2524 if (data
.dptr
== NULL
) {
2525 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
2529 r
= (struct srvid_request_data
*)data
.dptr
;
2533 DEBUG(DEBUG_NOTICE
,("Reenabling takeover runs\n"));
2534 clear_takeover_runs_disable(rec
);
2535 ret
= ctdb_get_pnn(ctdb
);
2539 if (rec
->takeover_run_in_progress
) {
2541 ("Unable to disable takeover runs - in progress\n"));
2546 DEBUG(DEBUG_NOTICE
,("Disabling takeover runs for %u seconds\n", timeout
));
2548 /* Clear any old timers */
2549 clear_takeover_runs_disable(rec
);
2551 /* When this is non-NULL it indicates that takeover runs are
2552 * disabled. This context also holds the timeout timer.
2554 rec
->takeover_runs_disable_ctx
= talloc_new(rec
);
2555 if (rec
->takeover_runs_disable_ctx
== NULL
) {
2556 DEBUG(DEBUG_ERR
,(__location__
" Unable to allocate memory\n"));
2561 /* Arrange for the timeout to occur */
2562 event_add_timed(ctdb
->ev
, rec
->takeover_runs_disable_ctx
,
2563 timeval_current_ofs(timeout
, 0),
2564 reenable_takeover_runs
,
2567 /* Returning our PNN tells the caller that we succeeded */
2568 ret
= ctdb_get_pnn(ctdb
);
2570 result
.dsize
= sizeof(int32_t);
2571 result
.dptr
= (uint8_t *)&ret
;
2572 srvid_request_reply(ctdb
, (struct srvid_request
*)r
, result
);
/* Backward compatibility for this SRVID - call
 * disable_takeover_runs_handler() instead.
 * Converts the legacy payload (a bare uint32_t timeout) into a
 * struct srvid_request_data with srvid==0 (meaning "no reply"),
 * then forwards it to disable_takeover_runs_handler().
 * NOTE(review): req is talloc'd on ctdb and no free is visible in
 * this extract — possibly leaked until ctdb is freed; confirm
 * against the full source. */
2575 /* Backward compatibility for this SRVID - call
2576 * disable_takeover_runs_handler() instead
2578 static void disable_ip_check_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2579 TDB_DATA data
, void *private_data
)
2581 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2582 struct ctdb_recoverd
);
2584 struct srvid_request_data
*req
;
2586 if (data
.dsize
!= sizeof(uint32_t)) {
2587 DEBUG(DEBUG_ERR
,(__location__
" Wrong size for data :%lu "
2588 "expecting %lu\n", (long unsigned)data
.dsize
,
2589 (long unsigned)sizeof(uint32_t)));
2592 if (data
.dptr
== NULL
) {
2593 DEBUG(DEBUG_ERR
,(__location__
" No data received\n"));
2597 req
= talloc(ctdb
, struct srvid_request_data
);
2598 CTDB_NO_MEMORY_VOID(ctdb
, req
);
2600 req
->srvid
= 0; /* No reply */
2602 req
->data
= *((uint32_t *)data
.dptr
); /* Timeout */
2604 data2
.dsize
= sizeof(*req
);
2605 data2
.dptr
= (uint8_t *)req
;
2607 disable_takeover_runs_handler(rec
->ctdb
,
2608 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
/* SRVID handler for IP reallocation requests: validates the payload
 * as a struct srvid_request and queues it on rec->reallocate_requests
 * (via srvid_request_add) for deferred processing in the
 * monitor_cluster loop, avoiding recursion into takeover_run(). */
2613 handler for ip reallocate, just add it to the list of requests and
2614 handle this later in the monitor_cluster loop so we do not recurse
2615 with other requests to takeover_run()
2617 static void ip_reallocate_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2618 TDB_DATA data
, void *private_data
)
2620 struct srvid_request
*request
;
2621 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
,
2622 struct ctdb_recoverd
);
2624 if (data
.dsize
!= sizeof(struct srvid_request
)) {
2625 DEBUG(DEBUG_ERR
, (__location__
" Wrong size of return address.\n"));
2629 request
= (struct srvid_request
*)data
.dptr
;
2631 srvid_request_add(ctdb
, &rec
->reallocate_requests
, request
);
/* Drain the currently-pending IP reallocation requests: detach the
 * pending list (new requests arriving during the run are processed on
 * a later iteration), refresh the remote public-IP lists, perform a
 * takeover run, and reply to every queued requester with this node's
 * PNN (success) via srvid_requests_reply().
 * Fix: the final call's second argument had been mojibake-corrupted
 * to "¤t" (an "&curren"/HTML-entity garbling of "&current");
 * srvid_requests_reply() takes struct srvid_requests ** so the
 * correct argument is &current. */
2634 static void process_ipreallocate_requests(struct ctdb_context
*ctdb
,
2635 struct ctdb_recoverd
*rec
)
2640 struct srvid_requests
*current
;
2642 DEBUG(DEBUG_INFO
, ("recovery master forced ip reallocation\n"));
2644 /* Only process requests that are currently pending. More
2645 * might come in while the takeover run is in progress and
2646 * they will need to be processed later since they might
2647 * be in response flag changes.
2649 current
= rec
->reallocate_requests
;
2650 rec
->reallocate_requests
= NULL
;
2652 /* update the list of public ips that a node can handle for
2655 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, rec
->nodemap
, &culprit
);
2657 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
2659 rec
->need_takeover_run
= true;
2662 if (do_takeover_run(rec
, rec
->nodemap
, false)) {
2663 ret
= ctdb_get_pnn(ctdb
);
2669 result
.dsize
= sizeof(int32_t);
2670 result
.dptr
= (uint8_t *)&ret
;
2672 srvid_requests_reply(ctdb
, &current
, result
);
/* SRVID handler for recovery-master election packets.  Ignores our
 * own packets, restarts the election timeout, and either (a) if we
 * would win per ctdb_election_win(), schedules our own election
 * request after 500ms, or (b) concedes: cancels any pending send,
 * releases the recovery lock when verify_recovery_lock is enabled
 * and the winner is another node, unbans all nodes, and records the
 * sender as recmaster via ctdb_ctrl_setrecmaster(). */
2677 handler for recovery master elections
2679 static void election_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2680 TDB_DATA data
, void *private_data
)
2682 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2684 struct election_message
*em
= (struct election_message
*)data
.dptr
;
2685 TALLOC_CTX
*mem_ctx
;
2687 /* Ignore election packets from ourself */
2688 if (ctdb
->pnn
== em
->pnn
) {
2692 /* we got an election packet - update the timeout for the election */
2693 talloc_free(rec
->election_timeout
);
2694 rec
->election_timeout
= event_add_timed(ctdb
->ev
, ctdb
,
2696 timeval_current_ofs(0, 500000) :
2697 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
2698 ctdb_election_timeout
, rec
);
2700 mem_ctx
= talloc_new(ctdb
);
2702 /* someone called an election. check their election data
2703 and if we disagree and we would rather be the elected node,
2704 send a new election message to all other nodes
2706 if (ctdb_election_win(rec
, em
)) {
2707 if (!rec
->send_election_te
) {
2708 rec
->send_election_te
= event_add_timed(ctdb
->ev
, rec
,
2709 timeval_current_ofs(0, 500000),
2710 election_send_request
, rec
);
2712 talloc_free(mem_ctx
);
2713 /*unban_all_nodes(ctdb);*/
2718 talloc_free(rec
->send_election_te
);
2719 rec
->send_election_te
= NULL
;
2721 if (ctdb
->tunable
.verify_recovery_lock
!= 0) {
2722 /* release the recmaster lock */
2723 if (em
->pnn
!= ctdb
->pnn
&&
2724 ctdb
->recovery_lock_fd
!= -1) {
2725 DEBUG(DEBUG_NOTICE
, ("Release the recovery lock\n"));
2726 close(ctdb
->recovery_lock_fd
);
2727 ctdb
->recovery_lock_fd
= -1;
2728 unban_all_nodes(ctdb
);
2732 /* ok, let that guy become recmaster then */
2733 ret
= ctdb_ctrl_setrecmaster(ctdb
, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb
), em
->pnn
);
2735 DEBUG(DEBUG_ERR
, (__location__
" failed to send recmaster election request"));
2736 talloc_free(mem_ctx
);
2740 talloc_free(mem_ctx
);
/* Force a new recovery-master election: put the whole cluster into
 * CTDB_RECOVERY_ACTIVE to stop internode traffic, (re)arm the
 * election timeout, broadcast our election request, then block in
 * ctdb_wait_election() to collect responses. */
2746 force the start of the election process
2748 static void force_election(struct ctdb_recoverd
*rec
, uint32_t pnn
,
2749 struct ctdb_node_map
*nodemap
)
2752 struct ctdb_context
*ctdb
= rec
->ctdb
;
2754 DEBUG(DEBUG_INFO
,(__location__
" Force an election\n"));
2756 /* set all nodes to recovery mode to stop all internode traffic */
2757 ret
= set_recovery_mode(ctdb
, rec
, nodemap
, CTDB_RECOVERY_ACTIVE
);
2759 DEBUG(DEBUG_ERR
, (__location__
" Unable to set recovery mode to active on cluster\n"));
2763 talloc_free(rec
->election_timeout
);
2764 rec
->election_timeout
= event_add_timed(ctdb
->ev
, ctdb
,
2766 timeval_current_ofs(0, 500000) :
2767 timeval_current_ofs(ctdb
->tunable
.election_timeout
, 0),
2768 ctdb_election_timeout
, rec
);
2770 ret
= send_election_request(rec
, pnn
);
2772 DEBUG(DEBUG_ERR
, (__location__
" failed to initiate recmaster election"));
2776 /* wait for a few seconds to collect all responses */
2777 ctdb_wait_election(rec
);
/* SRVID handler for node flag changes.  Locates the changed node in
 * the current nodemap, records the new flags, and — when this node is
 * the recmaster and the cluster is in normal recovery mode — flags a
 * takeover run (rec->need_takeover_run) if the DISABLED/unhealthy bit
 * actually changed.  Disconnected/banned transitions are handled by
 * the recovery path instead. */
2783 handler for when a node changes its flags
2785 static void monitor_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2786 TDB_DATA data
, void *private_data
)
2789 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
2790 struct ctdb_node_map
*nodemap
=NULL
;
2791 TALLOC_CTX
*tmp_ctx
;
2793 struct ctdb_recoverd
*rec
= talloc_get_type(private_data
, struct ctdb_recoverd
);
2794 int disabled_flag_changed
;
2796 if (data
.dsize
!= sizeof(*c
)) {
2797 DEBUG(DEBUG_ERR
,(__location__
"Invalid data in ctdb_node_flag_change\n"));
2801 tmp_ctx
= talloc_new(ctdb
);
2802 CTDB_NO_MEMORY_VOID(ctdb
, tmp_ctx
);
2804 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &nodemap
);
2806 DEBUG(DEBUG_ERR
,(__location__
"ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2807 talloc_free(tmp_ctx
);
/* find the changed node's slot in the nodemap */
2812 for (i
=0;i
<nodemap
->num
;i
++) {
2813 if (nodemap
->nodes
[i
].pnn
== c
->pnn
) break;
2816 if (i
== nodemap
->num
) {
2817 DEBUG(DEBUG_CRIT
,(__location__
"Flag change for non-existant node %u\n", c
->pnn
));
2818 talloc_free(tmp_ctx
);
2822 if (c
->old_flags
!= c
->new_flags
) {
2823 DEBUG(DEBUG_NOTICE
,("Node %u has changed flags - now 0x%x was 0x%x\n", c
->pnn
, c
->new_flags
, c
->old_flags
));
/* XOR old vs new flags and mask so only a DISABLED transition counts */
2826 disabled_flag_changed
= (nodemap
->nodes
[i
].flags
^ c
->new_flags
) & NODE_FLAGS_DISABLED
;
2828 nodemap
->nodes
[i
].flags
= c
->new_flags
;
2830 ret
= ctdb_ctrl_getrecmaster(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(),
2831 CTDB_CURRENT_NODE
, &ctdb
->recovery_master
);
2834 ret
= ctdb_ctrl_getrecmode(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(),
2835 CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
2839 ctdb
->recovery_master
== ctdb
->pnn
&&
2840 ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
2841 /* Only do the takeover run if the perm disabled or unhealthy
2842 flags changed since these will cause an ip failover but not
2844 If the node became disconnected or banned this will also
2845 lead to an ip address failover but that is handled
2848 if (disabled_flag_changed
) {
2849 rec
->need_takeover_run
= true;
2853 talloc_free(tmp_ctx
);
/* SRVID handler: push a node's flag change to all connected nodes.
 * Fetches the authoritative nodemap from the recmaster, validates the
 * changed node's PNN against it, then broadcasts a MODIFY_FLAGS
 * control to every connected node via ctdb_client_async_control(). */
2857 handler for when we need to push out flag changes to all other nodes
2859 static void push_flags_handler(struct ctdb_context
*ctdb
, uint64_t srvid
,
2860 TDB_DATA data
, void *private_data
)
2863 struct ctdb_node_flag_change
*c
= (struct ctdb_node_flag_change
*)data
.dptr
;
2864 struct ctdb_node_map
*nodemap
=NULL
;
2865 TALLOC_CTX
*tmp_ctx
= talloc_new(ctdb
);
2869 /* find the recovery master */
2870 ret
= ctdb_ctrl_getrecmaster(ctdb
, tmp_ctx
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &recmaster
);
2872 DEBUG(DEBUG_ERR
, (__location__
" Unable to get recmaster from local node\n"));
2873 talloc_free(tmp_ctx
);
2877 /* read the node flags from the recmaster */
2878 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), recmaster
, tmp_ctx
, &nodemap
);
2880 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", c
->pnn
));
2881 talloc_free(tmp_ctx
);
2884 if (c
->pnn
>= nodemap
->num
) {
2885 DEBUG(DEBUG_ERR
,(__location__
" Nodemap from recmaster does not contain node %d\n", c
->pnn
));
2886 talloc_free(tmp_ctx
);
2890 /* send the flags update to all connected nodes */
2891 nodes
= list_of_connected_nodes(ctdb
, nodemap
, tmp_ctx
, true);
2893 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_MODIFY_FLAGS
,
2894 nodes
, 0, CONTROL_TIMEOUT(),
2898 DEBUG(DEBUG_ERR
, (__location__
" ctdb_control to modify node flags failed\n"));
2900 talloc_free(tmp_ctx
);
2904 talloc_free(tmp_ctx
);
/* Shared accumulator for the async recmode verification: each
 * per-node callback folds its result into 'status'.
 * NOTE(review): the extract drops an original line here (the
 * outstanding-reply counter used as rmdata->count elsewhere) —
 * confirm against the full source. */
2908 struct verify_recmode_normal_data
{
2910 enum monitor_result status
;
/* Async completion callback for one node's GET_RECMODE control.
 * Downgrades the shared status to MONITOR_FAILED if the control
 * itself failed, or to MONITOR_RECOVERY_NEEDED if the node reports a
 * recmode other than CTDB_RECOVERY_NORMAL. */
2913 static void verify_recmode_normal_callback(struct ctdb_client_control_state
*state
)
2915 struct verify_recmode_normal_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmode_normal_data
);
2918 /* one more node has responded with recmode data*/
2921 /* if we failed to get the recmode, then return an error and let
2922 the main loop try again.
2924 if (state
->state
!= CTDB_CONTROL_DONE
) {
2925 if (rmdata
->status
== MONITOR_OK
) {
2926 rmdata
->status
= MONITOR_FAILED
;
2931 /* if we got a response, then the recmode will be stored in the
2934 if (state
->status
!= CTDB_RECOVERY_NORMAL
) {
2935 DEBUG(DEBUG_NOTICE
, ("Node:%u was in recovery mode. Start recovery process\n", state
->c
->hdr
.destnode
));
2936 rmdata
->status
= MONITOR_RECOVERY_NEEDED
;
/* Fan out an async GET_RECMODE to every active node, pump the event
 * loop until all replies arrive (rmdata->count reaches 0), and return
 * the aggregated monitor_result (MONITOR_OK unless a callback
 * downgraded it).  Returns MONITOR_FAILED immediately if a send
 * fails. */
2943 /* verify that all nodes are in normal recovery mode */
2944 static enum monitor_result
verify_recmode(struct ctdb_context
*ctdb
, struct ctdb_node_map
*nodemap
)
2946 struct verify_recmode_normal_data
*rmdata
;
2947 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
2948 struct ctdb_client_control_state
*state
;
2949 enum monitor_result status
;
2952 rmdata
= talloc(mem_ctx
, struct verify_recmode_normal_data
);
2953 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
2955 rmdata
->status
= MONITOR_OK
;
2957 /* loop over all active nodes and send an async getrecmode call to
2959 for (j
=0; j
<nodemap
->num
; j
++) {
2960 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
2963 state
= ctdb_ctrl_getrecmode_send(ctdb
, mem_ctx
,
2965 nodemap
->nodes
[j
].pnn
);
2966 if (state
== NULL
) {
2967 /* we failed to send the control, treat this as
2968 an error and try again next iteration
2970 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2971 talloc_free(mem_ctx
);
2972 return MONITOR_FAILED
;
2975 /* set up the callback functions */
2976 state
->async
.fn
= verify_recmode_normal_callback
;
2977 state
->async
.private_data
= rmdata
;
2979 /* one more control to wait for to complete */
2984 /* now wait for up to the maximum number of seconds allowed
2985 or until all nodes we expect a response from has replied
2987 while (rmdata
->count
> 0) {
2988 event_loop_once(ctdb
->ev
);
2991 status
= rmdata
->status
;
2992 talloc_free(mem_ctx
);
/* Shared accumulator for the async recmaster verification: holds the
 * recoverd context (for culprit accounting) and the aggregated
 * status.  NOTE(review): the extract drops original lines here (the
 * reply counter and expected-pnn fields referenced as rmdata->count /
 * rmdata->pnn elsewhere) — confirm against the full source. */
2997 struct verify_recmaster_data
{
2998 struct ctdb_recoverd
*rec
;
3001 enum monitor_result status
;
/* Async completion callback for one node's GET_RECMASTER control.
 * Downgrades the shared status to MONITOR_FAILED on control failure;
 * if the node names a recmaster other than the expected PNN it marks
 * that node a culprit and sets MONITOR_ELECTION_NEEDED. */
3004 static void verify_recmaster_callback(struct ctdb_client_control_state
*state
)
3006 struct verify_recmaster_data
*rmdata
= talloc_get_type(state
->async
.private_data
, struct verify_recmaster_data
);
3009 /* one more node has responded with recmaster data*/
3012 /* if we failed to get the recmaster, then return an error and let
3013 the main loop try again.
3015 if (state
->state
!= CTDB_CONTROL_DONE
) {
3016 if (rmdata
->status
== MONITOR_OK
) {
3017 rmdata
->status
= MONITOR_FAILED
;
3022 /* if we got a response, then the recmaster will be stored in the
3025 if (state
->status
!= rmdata
->pnn
) {
3026 DEBUG(DEBUG_ERR
,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state
->c
->hdr
.destnode
, state
->status
));
3027 ctdb_set_culprit(rmdata
->rec
, state
->c
->hdr
.destnode
);
3028 rmdata
->status
= MONITOR_ELECTION_NEEDED
;
/* Fan out an async GET_RECMASTER to every active node, pump the
 * event loop until all replies arrive, and return the aggregated
 * monitor_result (MONITOR_ELECTION_NEEDED if any node disagrees that
 * 'pnn' is the recmaster).  Returns MONITOR_FAILED immediately if a
 * send fails. */
3035 /* verify that all nodes agree that we are the recmaster */
3036 static enum monitor_result
verify_recmaster(struct ctdb_recoverd
*rec
, struct ctdb_node_map
*nodemap
, uint32_t pnn
)
3038 struct ctdb_context
*ctdb
= rec
->ctdb
;
3039 struct verify_recmaster_data
*rmdata
;
3040 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
3041 struct ctdb_client_control_state
*state
;
3042 enum monitor_result status
;
3045 rmdata
= talloc(mem_ctx
, struct verify_recmaster_data
);
3046 CTDB_NO_MEMORY_FATAL(ctdb
, rmdata
);
3050 rmdata
->status
= MONITOR_OK
;
3052 /* loop over all active nodes and send an async getrecmaster call to
3054 for (j
=0; j
<nodemap
->num
; j
++) {
3055 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3058 state
= ctdb_ctrl_getrecmaster_send(ctdb
, mem_ctx
,
3060 nodemap
->nodes
[j
].pnn
);
3061 if (state
== NULL
) {
3062 /* we failed to send the control, treat this as
3063 an error and try again next iteration
3065 DEBUG(DEBUG_ERR
,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3066 talloc_free(mem_ctx
);
3067 return MONITOR_FAILED
;
3070 /* set up the callback functions */
3071 state
->async
.fn
= verify_recmaster_callback
;
3072 state
->async
.private_data
= rmdata
;
3074 /* one more control to wait for to complete */
3079 /* now wait for up to the maximum number of seconds allowed
3080 or until all nodes we expect a response from has replied
3082 while (rmdata
->count
> 0) {
3083 event_loop_once(ctdb
->ev
);
3086 status
= rmdata
->status
;
3087 talloc_free(mem_ctx
);
/* Compare the local node's current interface list against the cached
 * copy in rec->ifaces.  Reports a change when there is no cached copy
 * yet, the interface count differs, or any slot's name or link state
 * differs; on a fetch failure it conservatively treats the
 * interfaces as changed.  The fresh list is stolen onto rec as the
 * new cache before returning. */
3091 static bool interfaces_have_changed(struct ctdb_context
*ctdb
,
3092 struct ctdb_recoverd
*rec
)
3094 struct ctdb_control_get_ifaces
*ifaces
= NULL
;
3095 TALLOC_CTX
*mem_ctx
;
3098 mem_ctx
= talloc_new(NULL
);
3100 /* Read the interfaces from the local node */
3101 if (ctdb_ctrl_get_ifaces(ctdb
, CONTROL_TIMEOUT(),
3102 CTDB_CURRENT_NODE
, mem_ctx
, &ifaces
) != 0) {
3103 DEBUG(DEBUG_ERR
, ("Unable to get interfaces from local node %u\n", ctdb
->pnn
));
3104 /* We could return an error. However, this will be
3105 * rare so we'll decide that the interfaces have
3106 * actually changed, just in case.
3108 talloc_free(mem_ctx
);
3113 /* We haven't been here before so things have changed */
3114 DEBUG(DEBUG_NOTICE
, ("Initial interface fetched\n"));
3116 } else if (rec
->ifaces
->num
!= ifaces
->num
) {
3117 /* Number of interfaces has changed */
3118 DEBUG(DEBUG_NOTICE
, ("Interface count changed from %d to %d\n",
3119 rec
->ifaces
->num
, ifaces
->num
));
3122 /* See if interface names or link states have changed */
3124 for (i
= 0; i
< rec
->ifaces
->num
; i
++) {
3125 struct ctdb_control_iface_info
* iface
= &rec
->ifaces
->ifaces
[i
];
3126 if (strcmp(iface
->name
, ifaces
->ifaces
[i
].name
) != 0) {
3128 ("Interface in slot %d changed: %s => %s\n",
3129 i
, iface
->name
, ifaces
->ifaces
[i
].name
));
3133 if (iface
->link_state
!= ifaces
->ifaces
[i
].link_state
) {
3135 ("Interface %s changed state: %d => %d\n",
3136 iface
->name
, iface
->link_state
,
3137 ifaces
->ifaces
[i
].link_state
));
/* cache the freshly-fetched list on rec for the next comparison */
3144 talloc_free(rec
->ifaces
);
3145 rec
->ifaces
= talloc_steal(rec
, ifaces
);
3147 talloc_free(mem_ctx
);
/* Periodic check that the local node's public-IP assignment matches
 * reality.  Samples uptime before and after an interface check and
 * skips the IP verification entirely if a recovery started/finished
 * in between (or is still in progress).  Unless IP failover is
 * disabled by tunable, it: flags a takeover run for any unassigned IP
 * this healthy node could serve; flags one for any IP assigned to us
 * but missing from an interface; and releases any IP we still hold
 * that should not be ours.  If anything needs rebalancing it sends a
 * CTDB_SRVID_TAKEOVER_RUN message to the recmaster. */
3151 /* called to check that the local allocation of public ip addresses is ok.
3153 static int verify_local_ip_allocation(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
, uint32_t pnn
, struct ctdb_node_map
*nodemap
)
3155 TALLOC_CTX
*mem_ctx
= talloc_new(NULL
);
3156 struct ctdb_uptime
*uptime1
= NULL
;
3157 struct ctdb_uptime
*uptime2
= NULL
;
3159 bool need_takeover_run
= false;
3161 ret
= ctdb_ctrl_uptime(ctdb
, mem_ctx
, CONTROL_TIMEOUT(),
3162 CTDB_CURRENT_NODE
, &uptime1
);
3164 DEBUG(DEBUG_ERR
, ("Unable to get uptime from local node %u\n", pnn
));
3165 talloc_free(mem_ctx
);
3169 if (interfaces_have_changed(ctdb
, rec
)) {
3170 DEBUG(DEBUG_NOTICE
, ("The interfaces status has changed on "
3171 "local node %u - force takeover run\n",
3173 need_takeover_run
= true;
3176 ret
= ctdb_ctrl_uptime(ctdb
, mem_ctx
, CONTROL_TIMEOUT(),
3177 CTDB_CURRENT_NODE
, &uptime2
);
3179 DEBUG(DEBUG_ERR
, ("Unable to get uptime from local node %u\n", pnn
));
3180 talloc_free(mem_ctx
);
3184 /* skip the check if the startrecovery time has changed */
3185 if (timeval_compare(&uptime1
->last_recovery_started
,
3186 &uptime2
->last_recovery_started
) != 0) {
3187 DEBUG(DEBUG_NOTICE
, (__location__
" last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3188 talloc_free(mem_ctx
);
3192 /* skip the check if the endrecovery time has changed */
3193 if (timeval_compare(&uptime1
->last_recovery_finished
,
3194 &uptime2
->last_recovery_finished
) != 0) {
3195 DEBUG(DEBUG_NOTICE
, (__location__
" last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3196 talloc_free(mem_ctx
);
3200 /* skip the check if we have started but not finished recovery */
3201 if (timeval_compare(&uptime1
->last_recovery_finished
,
3202 &uptime1
->last_recovery_started
) != 1) {
3203 DEBUG(DEBUG_INFO
, (__location__
" in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3204 talloc_free(mem_ctx
);
3209 /* verify that we have the ip addresses we should have
3210 and we dont have ones we shouldnt have.
3211 if we find an inconsistency we set recmode to
3212 active on the local node and wait for the recmaster
3213 to do a full blown recovery.
3214 also if the pnn is -1 and we are healthy and can host the ip
3215 we also request a ip reallocation.
3217 if (ctdb
->tunable
.disable_ip_failover
== 0) {
3218 struct ctdb_all_public_ips
*ips
= NULL
;
3220 /* read the *available* IPs from the local node */
3221 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE
, &ips
);
3223 DEBUG(DEBUG_ERR
, ("Unable to get available public IPs from local node %u\n", pnn
));
3224 talloc_free(mem_ctx
);
3228 for (j
=0; j
<ips
->num
; j
++) {
3229 if (ips
->ips
[j
].pnn
== -1 &&
3230 nodemap
->nodes
[pnn
].flags
== 0) {
3231 DEBUG(DEBUG_CRIT
,("Public IP '%s' is not assigned and we could serve it\n",
3232 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3233 need_takeover_run
= true;
3239 /* read the *known* IPs from the local node */
3240 ret
= ctdb_ctrl_get_public_ips_flags(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, mem_ctx
, 0, &ips
);
3242 DEBUG(DEBUG_ERR
, ("Unable to get known public IPs from local node %u\n", pnn
));
3243 talloc_free(mem_ctx
);
3247 for (j
=0; j
<ips
->num
; j
++) {
3248 if (ips
->ips
[j
].pnn
== pnn
) {
3249 if (ctdb
->do_checkpublicip
&& !ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
3250 DEBUG(DEBUG_CRIT
,("Public IP '%s' is assigned to us but not on an interface\n",
3251 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3252 need_takeover_run
= true;
3255 if (ctdb
->do_checkpublicip
&&
3256 ctdb_sys_have_ip(&ips
->ips
[j
].addr
)) {
3258 DEBUG(DEBUG_CRIT
,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3259 ctdb_addr_to_str(&ips
->ips
[j
].addr
)));
3261 if (ctdb_ctrl_release_ip(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ips
->ips
[j
]) != 0) {
3262 DEBUG(DEBUG_ERR
,("Failed to release local IP address\n"));
3269 if (need_takeover_run
) {
3270 struct srvid_request rd
;
3273 DEBUG(DEBUG_CRIT
,("Trigger takeoverrun\n"));
3277 data
.dptr
= (uint8_t *)&rd
;
3278 data
.dsize
= sizeof(rd
);
3280 ret
= ctdb_client_send_message(ctdb
, rec
->recmaster
, CTDB_SRVID_TAKEOVER_RUN
, data
);
3282 DEBUG(DEBUG_ERR
,(__location__
" Failed to send ipreallocate to recmaster :%d\n", (int)rec
->recmaster
));
3285 talloc_free(mem_ctx
);
/* Async callback for CTDB_CONTROL_GET_NODEMAP: after bounds-checking
 * the replying node's PNN against ctdb->num_nodes, steal the returned
 * nodemap buffer into the remote_nodemaps[] slot for that node. */
3290 static void async_getnodemap_callback(struct ctdb_context
*ctdb
, uint32_t node_pnn
, int32_t res
, TDB_DATA outdata
, void *callback_data
)
3292 struct ctdb_node_map
**remote_nodemaps
= callback_data
;
3294 if (node_pnn
>= ctdb
->num_nodes
) {
3295 DEBUG(DEBUG_ERR
,(__location__
" pnn from invalid node\n"));
3299 remote_nodemaps
[node_pnn
] = (struct ctdb_node_map
*)talloc_steal(remote_nodemaps
, outdata
.dptr
);
/* Collect the nodemap as seen by every active node: broadcast
 * CTDB_CONTROL_GET_NODEMAP via ctdb_client_async_control(), with
 * async_getnodemap_callback() filling remote_nodemaps[] per reply.
 * Logs and reports failure if any node could not be pulled. */
3303 static int get_remote_nodemaps(struct ctdb_context
*ctdb
, TALLOC_CTX
*mem_ctx
,
3304 struct ctdb_node_map
*nodemap
,
3305 struct ctdb_node_map
**remote_nodemaps
)
3309 nodes
= list_of_active_nodes(ctdb
, nodemap
, mem_ctx
, true);
3310 if (ctdb_client_async_control(ctdb
, CTDB_CONTROL_GET_NODEMAP
,
3312 CONTROL_TIMEOUT(), false, tdb_null
,
3313 async_getnodemap_callback
,
3315 remote_nodemaps
) != 0) {
3316 DEBUG(DEBUG_ERR
, (__location__
" Unable to pull all remote nodemaps\n"));
/* State for the forked recovery-lock checker: the child's progress
 * (enum reclock_child_status) plus the pipe, timers and fd event used
 * by the parent to collect its verdict.  NOTE(review): the extract
 * drops original lines here (the child pid and pipe fd[] members
 * referenced as state->child / state->fd elsewhere) — confirm against
 * the full source. */
3324 enum reclock_child_status
{ RECLOCK_CHECKING
, RECLOCK_OK
, RECLOCK_FAILED
, RECLOCK_TIMEOUT
};
3325 struct ctdb_check_reclock_state
{
3326 struct ctdb_context
*ctdb
;
3327 struct timeval start_time
;
3330 struct timed_event
*te
;
3331 struct fd_event
*fde
;
3332 enum reclock_child_status status
;
/* talloc destructor for the reclock-check state: report the lock
 * latency to the daemon, close both pipe ends if still open, and
 * SIGKILL the child so it cannot outlive the state object. */
3335 /* when we free the reclock state we must kill any child process.
3337 static int check_reclock_destructor(struct ctdb_check_reclock_state
*state
)
3339 struct ctdb_context
*ctdb
= state
->ctdb
;
3341 ctdb_ctrl_report_recd_lock_latency(ctdb
, CONTROL_TIMEOUT(), timeval_elapsed(&state
->start_time
));
3343 if (state
->fd
[0] != -1) {
3344 close(state
->fd
[0]);
3347 if (state
->fd
[1] != -1) {
3348 close(state
->fd
[1]);
3351 ctdb_kill(ctdb
, state
->child
, SIGKILL
);
3356 called if our check_reclock child times out. this would happen if
3357 i/o to the reclock file blocks.
3359 static void ctdb_check_reclock_timeout(struct event_context
*ev
, struct timed_event
*te
,
3360 struct timeval t
, void *private_data
)
3362 struct ctdb_check_reclock_state
*state
= talloc_get_type(private_data
,
3363 struct ctdb_check_reclock_state
);
3365 DEBUG(DEBUG_ERR
,(__location__
" check_reclock child process hung/timedout CFS slow to grant locks?\n"));
3366 state
->status
= RECLOCK_TIMEOUT
;
/* fd event handler: the reclock child wrote its one-byte verdict to
 * the pipe.  Cancels the timeout timer, reads the byte, and sets
 * state->status to RECLOCK_OK or RECLOCK_FAILED accordingly. */
3369 /* this is called when the child process has completed checking the reclock
3370 file and has written data back to us through the pipe.
3372 static void reclock_child_handler(struct event_context
*ev
, struct fd_event
*fde
,
3373 uint16_t flags
, void *private_data
)
3375 struct ctdb_check_reclock_state
*state
= talloc_get_type(private_data
,
3376 struct ctdb_check_reclock_state
);
3380 /* we got a response from our child process so we can abort the
3383 talloc_free(state
->te
);
3386 ret
= read(state
->fd
[0], &c
, 1);
3387 if (ret
!= 1 || c
!= RECLOCK_OK
) {
3388 DEBUG(DEBUG_ERR
,(__location__
" reclock child process returned error %d\n", c
));
3389 state
->status
= RECLOCK_FAILED
;
3394 state
->status
= RECLOCK_OK
;
/* Verify we still hold the recovery lock without risking the main
 * daemon blocking on cluster-filesystem I/O: fork a child that does a
 * pread() on recovery_lock_fd and writes a one-byte verdict back
 * through a pipe, while the parent waits on an fd event with a
 * 15-second timeout, pumping the event loop until the status leaves
 * RECLOCK_CHECKING.  On RECLOCK_FAILED the fd is closed and reset to
 * -1.  Cleanup (closing fds, killing the child) is handled by the
 * talloc destructor on 'state'. */
3398 static int check_recovery_lock(struct ctdb_context
*ctdb
)
3401 struct ctdb_check_reclock_state
*state
;
3402 pid_t parent
= getpid();
3404 if (ctdb
->recovery_lock_fd
== -1) {
3405 DEBUG(DEBUG_CRIT
,("recovery master doesn't have the recovery lock\n"));
3409 state
= talloc(ctdb
, struct ctdb_check_reclock_state
);
3410 CTDB_NO_MEMORY(ctdb
, state
);
3413 state
->start_time
= timeval_current();
3414 state
->status
= RECLOCK_CHECKING
;
3418 ret
= pipe(state
->fd
);
3421 DEBUG(DEBUG_CRIT
,(__location__
" Failed to open pipe for check_reclock child\n"));
3425 state
->child
= ctdb_fork(ctdb
);
3426 if (state
->child
== (pid_t
)-1) {
3427 DEBUG(DEBUG_CRIT
,(__location__
" fork() failed in check_reclock child\n"));
3428 close(state
->fd
[0]);
3430 close(state
->fd
[1]);
/* child: probe the lock file and report a single status byte */
3436 if (state
->child
== 0) {
3437 char cc
= RECLOCK_OK
;
3438 close(state
->fd
[0]);
3441 ctdb_set_process_name("ctdb_rec_reclock");
3442 debug_extra
= talloc_asprintf(NULL
, "recovery-lock:");
3443 if (pread(ctdb
->recovery_lock_fd
, &cc
, 1, 0) == -1) {
3444 DEBUG(DEBUG_CRIT
,("failed read from recovery_lock_fd - %s\n", strerror(errno
)));
3445 cc
= RECLOCK_FAILED
;
/* NOTE(review): write() return value is ignored here */
3448 write(state
->fd
[1], &cc
, 1);
3449 /* make sure we die when our parent dies */
3450 while (ctdb_kill(ctdb
, parent
, 0) == 0 || errno
!= ESRCH
) {
/* parent: keep only the read end and wait for the verdict */
3455 close(state
->fd
[1]);
3457 set_close_on_exec(state
->fd
[0]);
3459 DEBUG(DEBUG_DEBUG
, (__location__
" Created PIPE FD:%d for check_recovery_lock\n", state
->fd
[0]));
3461 talloc_set_destructor(state
, check_reclock_destructor
);
3463 state
->te
= event_add_timed(ctdb
->ev
, state
, timeval_current_ofs(15, 0),
3464 ctdb_check_reclock_timeout
, state
);
3465 if (state
->te
== NULL
) {
3466 DEBUG(DEBUG_CRIT
,(__location__
" Failed to create a timed event for reclock child\n"));
3471 state
->fde
= event_add_fd(ctdb
->ev
, state
, state
->fd
[0],
3473 reclock_child_handler
,
3476 if (state
->fde
== NULL
) {
3477 DEBUG(DEBUG_CRIT
,(__location__
" Failed to create an fd event for reclock child\n"));
3481 tevent_fd_set_auto_close(state
->fde
);
3483 while (state
->status
== RECLOCK_CHECKING
) {
3484 event_loop_once(ctdb
->ev
);
3487 if (state
->status
== RECLOCK_FAILED
) {
3488 DEBUG(DEBUG_ERR
,(__location__
" reclock child failed when checking file\n"));
3489 close(ctdb
->recovery_lock_fd
);
3490 ctdb
->recovery_lock_fd
= -1;
/* Re-read the reclock file path from the main daemon and reconcile
 * local state with it.  Handles four cases: daemon reports no reclock
 * (drop our copy, close the fd, disable verify_recovery_lock); we had
 * none cached (adopt the new path, close any stale fd); path
 * unchanged (nothing to do); path changed (swap in the new path,
 * disable verification until the new lock is taken, close the old
 * fd). */
3499 static int update_recovery_lock_file(struct ctdb_context
*ctdb
)
3501 TALLOC_CTX
*tmp_ctx
= talloc_new(NULL
);
3502 const char *reclockfile
;
3504 if (ctdb_ctrl_getreclock(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, tmp_ctx
, &reclockfile
) != 0) {
3505 DEBUG(DEBUG_ERR
,("Failed to read reclock file from daemon\n"));
3506 talloc_free(tmp_ctx
);
3510 if (reclockfile
== NULL
) {
3511 if (ctdb
->recovery_lock_file
!= NULL
) {
3512 DEBUG(DEBUG_ERR
,("Reclock file disabled\n"));
3513 talloc_free(ctdb
->recovery_lock_file
);
3514 ctdb
->recovery_lock_file
= NULL
;
3515 if (ctdb
->recovery_lock_fd
!= -1) {
3516 close(ctdb
->recovery_lock_fd
);
3517 ctdb
->recovery_lock_fd
= -1;
3520 ctdb
->tunable
.verify_recovery_lock
= 0;
3521 talloc_free(tmp_ctx
);
3525 if (ctdb
->recovery_lock_file
== NULL
) {
3526 ctdb
->recovery_lock_file
= talloc_strdup(ctdb
, reclockfile
);
3527 if (ctdb
->recovery_lock_fd
!= -1) {
3528 close(ctdb
->recovery_lock_fd
);
3529 ctdb
->recovery_lock_fd
= -1;
3531 talloc_free(tmp_ctx
);
3536 if (!strcmp(reclockfile
, ctdb
->recovery_lock_file
)) {
3537 talloc_free(tmp_ctx
);
3541 talloc_free(ctdb
->recovery_lock_file
);
3542 ctdb
->recovery_lock_file
= talloc_strdup(ctdb
, reclockfile
);
3543 ctdb
->tunable
.verify_recovery_lock
= 0;
3544 if (ctdb
->recovery_lock_fd
!= -1) {
3545 close(ctdb
->recovery_lock_fd
);
3546 ctdb
->recovery_lock_fd
= -1;
3549 talloc_free(tmp_ctx
);
3553 static void main_loop(struct ctdb_context
*ctdb
, struct ctdb_recoverd
*rec
,
3554 TALLOC_CTX
*mem_ctx
)
3557 struct ctdb_node_map
*nodemap
=NULL
;
3558 struct ctdb_node_map
*recmaster_nodemap
=NULL
;
3559 struct ctdb_node_map
**remote_nodemaps
=NULL
;
3560 struct ctdb_vnn_map
*vnnmap
=NULL
;
3561 struct ctdb_vnn_map
*remote_vnnmap
=NULL
;
3562 int32_t debug_level
;
3567 /* verify that the main daemon is still running */
3568 if (ctdb_kill(ctdb
, ctdb
->ctdbd_pid
, 0) != 0) {
3569 DEBUG(DEBUG_CRIT
,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3573 /* ping the local daemon to tell it we are alive */
3574 ctdb_ctrl_recd_ping(ctdb
);
3576 if (rec
->election_timeout
) {
3577 /* an election is in progress */
3581 /* read the debug level from the parent and update locally */
3582 ret
= ctdb_ctrl_get_debuglevel(ctdb
, CTDB_CURRENT_NODE
, &debug_level
);
3584 DEBUG(DEBUG_ERR
, (__location__
" Failed to read debuglevel from parent\n"));
3587 LogLevel
= debug_level
;
3589 /* get relevant tunables */
3590 ret
= ctdb_ctrl_get_all_tunables(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->tunable
);
3592 DEBUG(DEBUG_ERR
,("Failed to get tunables - retrying\n"));
3596 /* get the current recovery lock file from the server */
3597 if (update_recovery_lock_file(ctdb
) != 0) {
3598 DEBUG(DEBUG_ERR
,("Failed to update the recovery lock file\n"));
3602 /* Make sure that if recovery lock verification becomes disabled when
3605 if (ctdb
->tunable
.verify_recovery_lock
== 0) {
3606 if (ctdb
->recovery_lock_fd
!= -1) {
3607 close(ctdb
->recovery_lock_fd
);
3608 ctdb
->recovery_lock_fd
= -1;
3612 pnn
= ctdb_get_pnn(ctdb
);
3614 /* get the vnnmap */
3615 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), pnn
, mem_ctx
, &vnnmap
);
3617 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from node %u\n", pnn
));
3622 /* get number of nodes */
3624 talloc_free(rec
->nodemap
);
3625 rec
->nodemap
= NULL
;
3628 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), pnn
, rec
, &rec
->nodemap
);
3630 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from node %u\n", pnn
));
3633 nodemap
= rec
->nodemap
;
3635 /* remember our own node flags */
3636 rec
->node_flags
= nodemap
->nodes
[pnn
].flags
;
3638 ban_misbehaving_nodes(rec
, &self_ban
);
3640 DEBUG(DEBUG_NOTICE
, ("This node was banned, restart main_loop\n"));
3644 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3645 also frozen and that the recmode is set to active.
3647 if (rec
->node_flags
& (NODE_FLAGS_STOPPED
| NODE_FLAGS_BANNED
)) {
3648 /* If this node has become inactive then we want to
3649 * reduce the chances of it taking over the recovery
3650 * master role when it becomes active again. This
3651 * helps to stabilise the recovery master role so that
3652 * it stays on the most stable node.
3654 rec
->priority_time
= timeval_current();
3656 ret
= ctdb_ctrl_getrecmode(ctdb
, mem_ctx
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, &ctdb
->recovery_mode
);
3658 DEBUG(DEBUG_ERR
,(__location__
" Failed to read recmode from local node\n"));
3660 if (ctdb
->recovery_mode
== CTDB_RECOVERY_NORMAL
) {
3661 DEBUG(DEBUG_ERR
,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3663 ret
= ctdb_ctrl_setrecmode(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
, CTDB_RECOVERY_ACTIVE
);
3665 DEBUG(DEBUG_ERR
,(__location__
" Failed to activate recovery mode in STOPPED or BANNED state\n"));
3669 ret
= ctdb_ctrl_freeze(ctdb
, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE
);
3671 DEBUG(DEBUG_ERR
,(__location__
" Failed to freeze node in STOPPED or BANNED state\n"));
3676 /* If this node is stopped or banned then it is not the recovery
3677 * master, so don't do anything. This prevents stopped or banned
3678 * node from starting election and sending unnecessary controls.
3683 /* check which node is the recovery master */
3684 ret
= ctdb_ctrl_getrecmaster(ctdb
, mem_ctx
, CONTROL_TIMEOUT(), pnn
, &rec
->recmaster
);
3686 DEBUG(DEBUG_ERR
, (__location__
" Unable to get recmaster from node %u\n", pnn
));
3690 /* If we are not the recmaster then do some housekeeping */
3691 if (rec
->recmaster
!= pnn
) {
3692 /* Ignore any IP reallocate requests - only recmaster
3695 TALLOC_FREE(rec
->reallocate_requests
);
3696 /* Clear any nodes that should be force rebalanced in
3697 * the next takeover run. If the recovery master role
3698 * has moved then we don't want to process these some
3699 * time in the future.
3701 TALLOC_FREE(rec
->force_rebalance_nodes
);
3704 /* This is a special case. When recovery daemon is started, recmaster
3705 * is set to -1. If a node is not started in stopped state, then
3706 * start election to decide recovery master
3708 if (rec
->recmaster
== (uint32_t)-1) {
3709 DEBUG(DEBUG_NOTICE
,(__location__
" Initial recovery master set - forcing election\n"));
3710 force_election(rec
, pnn
, nodemap
);
3714 /* update the capabilities for all nodes */
3715 ret
= update_capabilities(ctdb
, nodemap
);
3717 DEBUG(DEBUG_ERR
, (__location__
" Unable to update node capabilities.\n"));
3722 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3723 * but we have, then force an election and try to become the new
3726 if ((rec
->ctdb
->nodes
[rec
->recmaster
]->capabilities
& CTDB_CAP_RECMASTER
) == 0 &&
3727 (rec
->ctdb
->capabilities
& CTDB_CAP_RECMASTER
) &&
3728 !(nodemap
->nodes
[pnn
].flags
& NODE_FLAGS_INACTIVE
)) {
3729 DEBUG(DEBUG_ERR
, (__location__
" Current recmaster node %u does not have CAP_RECMASTER,"
3730 " but we (node %u) have - force an election\n",
3731 rec
->recmaster
, pnn
));
3732 force_election(rec
, pnn
, nodemap
);
3736 /* count how many active nodes there are */
3737 rec
->num_active
= 0;
3738 rec
->num_lmasters
= 0;
3739 rec
->num_connected
= 0;
3740 for (i
=0; i
<nodemap
->num
; i
++) {
3741 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_INACTIVE
)) {
3743 if (rec
->ctdb
->nodes
[i
]->capabilities
& CTDB_CAP_LMASTER
) {
3744 rec
->num_lmasters
++;
3747 if (!(nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
)) {
3748 rec
->num_connected
++;
3753 /* verify that the recmaster node is still active */
3754 for (j
=0; j
<nodemap
->num
; j
++) {
3755 if (nodemap
->nodes
[j
].pnn
==rec
->recmaster
) {
3760 if (j
== nodemap
->num
) {
3761 DEBUG(DEBUG_ERR
, ("Recmaster node %u not in list. Force reelection\n", rec
->recmaster
));
3762 force_election(rec
, pnn
, nodemap
);
3766 /* if recovery master is disconnected we must elect a new recmaster */
3767 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_DISCONNECTED
) {
3768 DEBUG(DEBUG_NOTICE
, ("Recmaster node %u is disconnected. Force reelection\n", nodemap
->nodes
[j
].pnn
));
3769 force_election(rec
, pnn
, nodemap
);
3773 /* get nodemap from the recovery master to check if it is inactive */
3774 ret
= ctdb_ctrl_getnodemap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
3775 mem_ctx
, &recmaster_nodemap
);
3777 DEBUG(DEBUG_ERR
, (__location__
" Unable to get nodemap from recovery master %u\n",
3778 nodemap
->nodes
[j
].pnn
));
3783 if ((recmaster_nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) &&
3784 (rec
->node_flags
& NODE_FLAGS_INACTIVE
) == 0) {
3785 DEBUG(DEBUG_NOTICE
, ("Recmaster node %u no longer available. Force reelection\n", nodemap
->nodes
[j
].pnn
));
3787 * update our nodemap to carry the recmaster's notion of
3788 * its own flags, so that we don't keep freezing the
3789 * inactive recmaster node...
3791 nodemap
->nodes
[j
].flags
= recmaster_nodemap
->nodes
[j
].flags
;
3792 force_election(rec
, pnn
, nodemap
);
3796 /* verify that we have all ip addresses we should have and we dont
3797 * have addresses we shouldnt have.
3799 if (ctdb
->tunable
.disable_ip_failover
== 0 &&
3800 rec
->takeover_runs_disable_ctx
== NULL
) {
3801 if (verify_local_ip_allocation(ctdb
, rec
, pnn
, nodemap
) != 0) {
3802 DEBUG(DEBUG_ERR
, (__location__
" Public IPs were inconsistent.\n"));
3807 /* if we are not the recmaster then we do not need to check
3808 if recovery is needed
3810 if (pnn
!= rec
->recmaster
) {
3815 /* ensure our local copies of flags are right */
3816 ret
= update_local_flags(rec
, nodemap
);
3817 if (ret
== MONITOR_ELECTION_NEEDED
) {
3818 DEBUG(DEBUG_NOTICE
,("update_local_flags() called for a re-election.\n"));
3819 force_election(rec
, pnn
, nodemap
);
3822 if (ret
!= MONITOR_OK
) {
3823 DEBUG(DEBUG_ERR
,("Unable to update local flags\n"));
3827 if (ctdb
->num_nodes
!= nodemap
->num
) {
3828 DEBUG(DEBUG_ERR
, (__location__
" ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb
->num_nodes
, nodemap
->num
));
3829 ctdb_load_nodes_file(ctdb
);
3833 /* verify that all active nodes agree that we are the recmaster */
3834 switch (verify_recmaster(rec
, nodemap
, pnn
)) {
3835 case MONITOR_RECOVERY_NEEDED
:
3836 /* can not happen */
3838 case MONITOR_ELECTION_NEEDED
:
3839 force_election(rec
, pnn
, nodemap
);
3843 case MONITOR_FAILED
:
3848 if (rec
->need_recovery
) {
3849 /* a previous recovery didn't finish */
3850 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3854 /* verify that all active nodes are in normal mode
3855 and not in recovery mode
3857 switch (verify_recmode(ctdb
, nodemap
)) {
3858 case MONITOR_RECOVERY_NEEDED
:
3859 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3861 case MONITOR_FAILED
:
3863 case MONITOR_ELECTION_NEEDED
:
3864 /* can not happen */
3870 if (ctdb
->tunable
.verify_recovery_lock
!= 0) {
3871 /* we should have the reclock - check its not stale */
3872 ret
= check_recovery_lock(ctdb
);
3874 DEBUG(DEBUG_ERR
,("Failed check_recovery_lock. Force a recovery\n"));
3875 ctdb_set_culprit(rec
, ctdb
->pnn
);
3876 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3882 /* if there are takeovers requested, perform it and notify the waiters */
3883 if (rec
->takeover_runs_disable_ctx
== NULL
&&
3884 rec
->reallocate_requests
) {
3885 process_ipreallocate_requests(ctdb
, rec
);
3888 /* get the nodemap for all active remote nodes
3890 remote_nodemaps
= talloc_array(mem_ctx
, struct ctdb_node_map
*, nodemap
->num
);
3891 if (remote_nodemaps
== NULL
) {
3892 DEBUG(DEBUG_ERR
, (__location__
" failed to allocate remote nodemap array\n"));
3895 for(i
=0; i
<nodemap
->num
; i
++) {
3896 remote_nodemaps
[i
] = NULL
;
3898 if (get_remote_nodemaps(ctdb
, mem_ctx
, nodemap
, remote_nodemaps
) != 0) {
3899 DEBUG(DEBUG_ERR
,(__location__
" Failed to read remote nodemaps\n"));
3903 /* verify that all other nodes have the same nodemap as we have
3905 for (j
=0; j
<nodemap
->num
; j
++) {
3906 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3910 if (remote_nodemaps
[j
] == NULL
) {
3911 DEBUG(DEBUG_ERR
,(__location__
" Did not get a remote nodemap for node %d, restarting monitoring\n", j
));
3912 ctdb_set_culprit(rec
, j
);
3917 /* if the nodes disagree on how many nodes there are
3918 then this is a good reason to try recovery
3920 if (remote_nodemaps
[j
]->num
!= nodemap
->num
) {
3921 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different node count. %u vs %u of the local node\n",
3922 nodemap
->nodes
[j
].pnn
, remote_nodemaps
[j
]->num
, nodemap
->num
));
3923 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3924 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
3928 /* if the nodes disagree on which nodes exist and are
3929 active, then that is also a good reason to do recovery
3931 for (i
=0;i
<nodemap
->num
;i
++) {
3932 if (remote_nodemaps
[j
]->nodes
[i
].pnn
!= nodemap
->nodes
[i
].pnn
) {
3933 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3934 nodemap
->nodes
[j
].pnn
, i
,
3935 remote_nodemaps
[j
]->nodes
[i
].pnn
, nodemap
->nodes
[i
].pnn
));
3936 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3937 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3945 * Update node flags obtained from each active node. This ensure we have
3946 * up-to-date information for all the nodes.
3948 for (j
=0; j
<nodemap
->num
; j
++) {
3949 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3952 nodemap
->nodes
[j
].flags
= remote_nodemaps
[j
]->nodes
[j
].flags
;
3955 for (j
=0; j
<nodemap
->num
; j
++) {
3956 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
3960 /* verify the flags are consistent
3962 for (i
=0; i
<nodemap
->num
; i
++) {
3963 if (nodemap
->nodes
[i
].flags
& NODE_FLAGS_DISCONNECTED
) {
3967 if (nodemap
->nodes
[i
].flags
!= remote_nodemaps
[j
]->nodes
[i
].flags
) {
3968 DEBUG(DEBUG_ERR
, (__location__
" Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3969 nodemap
->nodes
[j
].pnn
,
3970 nodemap
->nodes
[i
].pnn
,
3971 remote_nodemaps
[j
]->nodes
[i
].flags
,
3972 nodemap
->nodes
[i
].flags
));
3974 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps
[j
]->nodes
[i
].flags
, j
));
3975 update_flags_on_all_nodes(ctdb
, nodemap
, nodemap
->nodes
[i
].pnn
, remote_nodemaps
[j
]->nodes
[i
].flags
);
3976 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3977 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3981 DEBUG(DEBUG_ERR
,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap
->nodes
[i
].flags
, i
));
3982 update_flags_on_all_nodes(ctdb
, nodemap
, nodemap
->nodes
[i
].pnn
, nodemap
->nodes
[i
].flags
);
3983 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
3984 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
3993 /* There must be the same number of lmasters in the vnn map as
3994 * there are active nodes with the lmaster capability... or
3997 if (vnnmap
->size
!= rec
->num_lmasters
) {
3998 DEBUG(DEBUG_ERR
, (__location__
" The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3999 vnnmap
->size
, rec
->num_lmasters
));
4000 ctdb_set_culprit(rec
, ctdb
->pnn
);
4001 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
4005 /* verify that all active nodes in the nodemap also exist in
4008 for (j
=0; j
<nodemap
->num
; j
++) {
4009 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
4012 if (nodemap
->nodes
[j
].pnn
== pnn
) {
4016 for (i
=0; i
<vnnmap
->size
; i
++) {
4017 if (vnnmap
->map
[i
] == nodemap
->nodes
[j
].pnn
) {
4021 if (i
== vnnmap
->size
) {
4022 DEBUG(DEBUG_ERR
, (__location__
" Node %u is active in the nodemap but did not exist in the vnnmap\n",
4023 nodemap
->nodes
[j
].pnn
));
4024 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
4025 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
4031 /* verify that all other nodes have the same vnnmap
4032 and are from the same generation
4034 for (j
=0; j
<nodemap
->num
; j
++) {
4035 if (nodemap
->nodes
[j
].flags
& NODE_FLAGS_INACTIVE
) {
4038 if (nodemap
->nodes
[j
].pnn
== pnn
) {
4042 ret
= ctdb_ctrl_getvnnmap(ctdb
, CONTROL_TIMEOUT(), nodemap
->nodes
[j
].pnn
,
4043 mem_ctx
, &remote_vnnmap
);
4045 DEBUG(DEBUG_ERR
, (__location__
" Unable to get vnnmap from remote node %u\n",
4046 nodemap
->nodes
[j
].pnn
));
4050 /* verify the vnnmap generation is the same */
4051 if (vnnmap
->generation
!= remote_vnnmap
->generation
) {
4052 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
4053 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->generation
, vnnmap
->generation
));
4054 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
4055 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
4059 /* verify the vnnmap size is the same */
4060 if (vnnmap
->size
!= remote_vnnmap
->size
) {
4061 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
4062 nodemap
->nodes
[j
].pnn
, remote_vnnmap
->size
, vnnmap
->size
));
4063 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
4064 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
4068 /* verify the vnnmap is the same */
4069 for (i
=0;i
<vnnmap
->size
;i
++) {
4070 if (remote_vnnmap
->map
[i
] != vnnmap
->map
[i
]) {
4071 DEBUG(DEBUG_ERR
, (__location__
" Remote node %u has different vnnmap.\n",
4072 nodemap
->nodes
[j
].pnn
));
4073 ctdb_set_culprit(rec
, nodemap
->nodes
[j
].pnn
);
4074 do_recovery(rec
, mem_ctx
, pnn
, nodemap
,
4081 /* we might need to change who has what IP assigned */
4082 if (rec
->need_takeover_run
) {
4083 uint32_t culprit
= (uint32_t)-1;
4085 rec
->need_takeover_run
= false;
4087 /* update the list of public ips that a node can handle for
4090 ret
= ctdb_reload_remote_public_ips(ctdb
, rec
, nodemap
, &culprit
);
4092 DEBUG(DEBUG_ERR
,("Failed to read public ips from remote node %d\n",
4094 rec
->need_takeover_run
= true;
4098 /* execute the "startrecovery" event script on all nodes */
4099 ret
= run_startrecovery_eventscript(rec
, nodemap
);
4101 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'startrecovery' event on cluster\n"));
4102 ctdb_set_culprit(rec
, ctdb
->pnn
);
4103 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
4107 /* If takeover run fails, then the offending nodes are
4108 * assigned ban culprit counts. And we re-try takeover.
4109 * If takeover run fails repeatedly, the node would get
4112 * If rec->need_takeover_run is not set to true at this
4113 * failure, monitoring is disabled cluster-wide (via
4114 * startrecovery eventscript) and will not get enabled.
4116 if (!do_takeover_run(rec
, nodemap
, true)) {
4120 /* execute the "recovered" event script on all nodes */
4121 ret
= run_recovered_eventscript(rec
, nodemap
, "monitor_cluster");
4123 // we cant check whether the event completed successfully
4124 // since this script WILL fail if the node is in recovery mode
4125 // and if that race happens, the code here would just cause a second
4126 // cascading recovery.
4128 DEBUG(DEBUG_ERR
, (__location__
" Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
4129 ctdb_set_culprit(rec
, ctdb
->pnn
);
4130 do_recovery(rec
, mem_ctx
, pnn
, nodemap
, vnnmap
);
4137 the main monitoring loop
4139 static void monitor_cluster(struct ctdb_context
*ctdb
)
4141 struct ctdb_recoverd
*rec
;
4143 DEBUG(DEBUG_NOTICE
,("monitor_cluster starting\n"));
4145 rec
= talloc_zero(ctdb
, struct ctdb_recoverd
);
4146 CTDB_NO_MEMORY_FATAL(ctdb
, rec
);
4150 rec
->takeover_run_in_progress
= false;
4152 rec
->priority_time
= timeval_current();
4154 /* register a message port for sending memory dumps */
4155 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_MEM_DUMP
, mem_dump_handler
, rec
);
4157 /* register a message port for requesting logs */
4158 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_GETLOG
, getlog_handler
, rec
);
4160 /* register a message port for clearing logs */
4161 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_CLEARLOG
, clearlog_handler
, rec
);
4163 /* register a message port for recovery elections */
4164 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RECOVERY
, election_handler
, rec
);
4166 /* when nodes are disabled/enabled */
4167 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_SET_NODE_FLAGS
, monitor_handler
, rec
);
4169 /* when we are asked to puch out a flag change */
4170 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_PUSH_NODE_FLAGS
, push_flags_handler
, rec
);
4172 /* register a message port for vacuum fetch */
4173 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_VACUUM_FETCH
, vacuum_fetch_handler
, rec
);
4175 /* register a message port for reloadnodes */
4176 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RELOAD_NODES
, reload_nodes_handler
, rec
);
4178 /* register a message port for performing a takeover run */
4179 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_TAKEOVER_RUN
, ip_reallocate_handler
, rec
);
4181 /* register a message port for disabling the ip check for a short while */
4182 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_DISABLE_IP_CHECK
, disable_ip_check_handler
, rec
);
4184 /* register a message port for updating the recovery daemons node assignment for an ip */
4185 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_RECD_UPDATE_IP
, recd_update_ip_handler
, rec
);
4187 /* register a message port for forcing a rebalance of a node next
4189 ctdb_client_set_message_handler(ctdb
, CTDB_SRVID_REBALANCE_NODE
, recd_node_rebalance_handler
, rec
);
4191 /* Register a message port for disabling takeover runs */
4192 ctdb_client_set_message_handler(ctdb
,
4193 CTDB_SRVID_DISABLE_TAKEOVER_RUNS
,
4194 disable_takeover_runs_handler
, rec
);
4196 /* register a message port for detaching database */
4197 ctdb_client_set_message_handler(ctdb
,
4198 CTDB_SRVID_DETACH_DATABASE
,
4199 detach_database_handler
, rec
);
4202 TALLOC_CTX
*mem_ctx
= talloc_new(ctdb
);
4203 struct timeval start
;
4207 DEBUG(DEBUG_CRIT
,(__location__
4208 " Failed to create temp context\n"));
4212 start
= timeval_current();
4213 main_loop(ctdb
, rec
, mem_ctx
);
4214 talloc_free(mem_ctx
);
4216 /* we only check for recovery once every second */
4217 elapsed
= timeval_elapsed(&start
);
4218 if (elapsed
< ctdb
->tunable
.recover_interval
) {
4219 ctdb_wait_timeout(ctdb
, ctdb
->tunable
.recover_interval
4226 event handler for when the main ctdbd dies
4228 static void ctdb_recoverd_parent(struct event_context
*ev
, struct fd_event
*fde
,
4229 uint16_t flags
, void *private_data
)
4231 DEBUG(DEBUG_ALERT
,("recovery daemon parent died - exiting\n"));
4236 called regularly to verify that the recovery daemon is still running
4238 static void ctdb_check_recd(struct event_context
*ev
, struct timed_event
*te
,
4239 struct timeval yt
, void *p
)
4241 struct ctdb_context
*ctdb
= talloc_get_type(p
, struct ctdb_context
);
4243 if (ctdb_kill(ctdb
, ctdb
->recoverd_pid
, 0) != 0) {
4244 DEBUG(DEBUG_ERR
,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb
->recoverd_pid
));
4246 event_add_timed(ctdb
->ev
, ctdb
, timeval_zero(),
4247 ctdb_restart_recd
, ctdb
);
4252 event_add_timed(ctdb
->ev
, ctdb
->recd_ctx
,
4253 timeval_current_ofs(30, 0),
4254 ctdb_check_recd
, ctdb
);
4257 static void recd_sig_child_handler(struct event_context
*ev
,
4258 struct signal_event
*se
, int signum
, int count
,
4262 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4267 pid
= waitpid(-1, &status
, WNOHANG
);
4269 if (errno
!= ECHILD
) {
4270 DEBUG(DEBUG_ERR
, (__location__
" waitpid() returned error. errno:%s(%d)\n", strerror(errno
),errno
));
4275 DEBUG(DEBUG_DEBUG
, ("RECD SIGCHLD from %d\n", (int)pid
));
4281 startup the recovery daemon as a child of the main ctdb daemon
4283 int ctdb_start_recoverd(struct ctdb_context
*ctdb
)
4286 struct signal_event
*se
;
4287 struct tevent_fd
*fde
;
4289 if (pipe(fd
) != 0) {
4293 ctdb
->recoverd_pid
= ctdb_fork_no_free_ringbuffer(ctdb
);
4294 if (ctdb
->recoverd_pid
== -1) {
4298 if (ctdb
->recoverd_pid
!= 0) {
4299 talloc_free(ctdb
->recd_ctx
);
4300 ctdb
->recd_ctx
= talloc_new(ctdb
);
4301 CTDB_NO_MEMORY(ctdb
, ctdb
->recd_ctx
);
4304 event_add_timed(ctdb
->ev
, ctdb
->recd_ctx
,
4305 timeval_current_ofs(30, 0),
4306 ctdb_check_recd
, ctdb
);
4312 srandom(getpid() ^ time(NULL
));
4314 /* Clear the log ringbuffer */
4315 ctdb_clear_log(ctdb
);
4317 ctdb_set_process_name("ctdb_recovered");
4318 if (switch_from_server_to_client(ctdb
, "recoverd") != 0) {
4319 DEBUG(DEBUG_CRIT
, (__location__
"ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4323 DEBUG(DEBUG_DEBUG
, (__location__
" Created PIPE FD:%d to recovery daemon\n", fd
[0]));
4325 fde
= event_add_fd(ctdb
->ev
, ctdb
, fd
[0], EVENT_FD_READ
,
4326 ctdb_recoverd_parent
, &fd
[0]);
4327 tevent_fd_set_auto_close(fde
);
4329 /* set up a handler to pick up sigchld */
4330 se
= event_add_signal(ctdb
->ev
, ctdb
,
4332 recd_sig_child_handler
,
4335 DEBUG(DEBUG_CRIT
,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4339 monitor_cluster(ctdb
);
4341 DEBUG(DEBUG_ALERT
,("ERROR: ctdb_recoverd finished!?\n"));
4346 shutdown the recovery daemon
4348 void ctdb_stop_recoverd(struct ctdb_context
*ctdb
)
4350 if (ctdb
->recoverd_pid
== 0) {
4354 DEBUG(DEBUG_NOTICE
,("Shutting down recovery daemon\n"));
4355 ctdb_kill(ctdb
, ctdb
->recoverd_pid
, SIGTERM
);
4357 TALLOC_FREE(ctdb
->recd_ctx
);
4358 TALLOC_FREE(ctdb
->recd_ping_count
);
4361 static void ctdb_restart_recd(struct event_context
*ev
, struct timed_event
*te
,
4362 struct timeval t
, void *private_data
)
4364 struct ctdb_context
*ctdb
= talloc_get_type(private_data
, struct ctdb_context
);
4366 DEBUG(DEBUG_ERR
,("Restarting recovery daemon\n"));
4367 ctdb_stop_recoverd(ctdb
);
4368 ctdb_start_recoverd(ctdb
);