ctdb-recoverd/vacuum: remove unneeded prototype.
[Samba.git] / ctdb / server / ctdb_recoverd.c
blob 1b4ac50500a296cb2102dcab2a158e01ae8b9e55
1 /*
2 ctdb recovery daemon
4 Copyright (C) Ronnie Sahlberg 2007
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <http://www.gnu.org/licenses/>.
20 #include "includes.h"
21 #include "system/filesys.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/wait.h"
25 #include "popt.h"
26 #include "cmdline.h"
27 #include "../include/ctdb_client.h"
28 #include "../include/ctdb_private.h"
29 #include "lib/tdb_wrap/tdb_wrap.h"
30 #include "lib/util/dlinklist.h"
33 /* List of SRVID requests that need to be processed */
34 struct srvid_list {
35 struct srvid_list *next, *prev;
36 struct srvid_request *request;
39 struct srvid_requests {
40 struct srvid_list *requests;
43 static void srvid_request_reply(struct ctdb_context *ctdb,
44 struct srvid_request *request,
45 TDB_DATA result)
47 /* Someone that sent srvid==0 does not want a reply */
48 if (request->srvid == 0) {
49 talloc_free(request);
50 return;
53 if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
54 result) == 0) {
55 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
56 (unsigned)request->pnn,
57 (unsigned long long)request->srvid));
58 } else {
59 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
60 (unsigned)request->pnn,
61 (unsigned long long)request->srvid));
64 talloc_free(request);
67 static void srvid_requests_reply(struct ctdb_context *ctdb,
68 struct srvid_requests **requests,
69 TDB_DATA result)
71 struct srvid_list *r;
73 for (r = (*requests)->requests; r != NULL; r = r->next) {
74 srvid_request_reply(ctdb, r->request, result);
77 /* Free the list structure... */
78 TALLOC_FREE(*requests);
81 static void srvid_request_add(struct ctdb_context *ctdb,
82 struct srvid_requests **requests,
83 struct srvid_request *request)
85 struct srvid_list *t;
86 int32_t ret;
87 TDB_DATA result;
89 if (*requests == NULL) {
90 *requests = talloc_zero(ctdb, struct srvid_requests);
91 if (*requests == NULL) {
92 goto nomem;
96 t = talloc_zero(*requests, struct srvid_list);
97 if (t == NULL) {
98 /* If *requests was just allocated above then free it */
99 if ((*requests)->requests == NULL) {
100 TALLOC_FREE(*requests);
102 goto nomem;
105 t->request = (struct srvid_request *)talloc_steal(t, request);
106 DLIST_ADD((*requests)->requests, t);
108 return;
110 nomem:
111 /* Failed to add the request to the list. Send a fail. */
112 DEBUG(DEBUG_ERR, (__location__
113 " Out of memory, failed to queue SRVID request\n"));
114 ret = -ENOMEM;
115 result.dsize = sizeof(ret);
116 result.dptr = (uint8_t *)&ret;
117 srvid_request_reply(ctdb, request, result);
120 /* An abstraction to allow an operation (takeover runs, recoveries,
121 * ...) to be disabled for a given timeout */
122 struct ctdb_op_state {
123 struct tevent_timer *timer;
124 bool in_progress;
125 const char *name;
128 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
130 struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
132 if (state != NULL) {
133 state->in_progress = false;
134 state->name = name;
137 return state;
140 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
142 return state->timer != NULL;
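/* The disabled state is represented purely by the presence of the
   re-enable timer: ctdb_op_enable() below clears it by freeing the timer,
   and the timeout handler re-enables the operation the same way. */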
145 static bool ctdb_op_begin(struct ctdb_op_state *state)
147 if (ctdb_op_is_disabled(state)) {
148 DEBUG(DEBUG_NOTICE,
149 ("Unable to begin - %s are disabled\n", state->name));
150 return false;
153 state->in_progress = true;
154 return true;
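/* Clear the in-progress flag. The assignment below evaluates to false,
   so this always returns false; callers typically ignore the result. */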
157 static bool ctdb_op_end(struct ctdb_op_state *state)
159 return state->in_progress = false;
162 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
164 return state->in_progress;
167 static void ctdb_op_enable(struct ctdb_op_state *state)
169 TALLOC_FREE(state->timer);
172 static void ctdb_op_timeout_handler(struct event_context *ev,
173 struct timed_event *te,
174 struct timeval yt, void *p)
176 struct ctdb_op_state *state =
177 talloc_get_type(p, struct ctdb_op_state);
179 DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
180 ctdb_op_enable(state);
183 static int ctdb_op_disable(struct ctdb_op_state *state,
184 struct tevent_context *ev,
185 uint32_t timeout)
187 if (timeout == 0) {
188 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
189 ctdb_op_enable(state);
190 return 0;
193 if (state->in_progress) {
194 DEBUG(DEBUG_ERR,
195 ("Unable to disable %s - in progress\n", state->name));
196 return -EAGAIN;
199 DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
200 state->name, timeout));
202 /* Clear any old timers */
203 talloc_free(state->timer);
205 /* Arrange for the timeout to occur */
206 state->timer = tevent_add_timer(ev, state,
207 timeval_current_ofs(timeout, 0),
208 ctdb_op_timeout_handler, state);
209 if (state->timer == NULL) {
210 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
211 return -ENOMEM;
214 return 0;
217 struct ctdb_banning_state {
218 uint32_t count;
219 struct timeval last_reported_time;
223 private state of recovery daemon
225 struct ctdb_recoverd {
226 struct ctdb_context *ctdb;
227 uint32_t recmaster;
228 uint32_t last_culprit_node;
229 struct ctdb_node_map *nodemap;
230 struct timeval priority_time;
231 bool need_takeover_run;
232 bool need_recovery;
233 uint32_t node_flags;
234 struct timed_event *send_election_te;
235 struct timed_event *election_timeout;
236 struct vacuum_info *vacuum_info;
237 struct srvid_requests *reallocate_requests;
238 struct ctdb_op_state *takeover_run;
239 struct ctdb_op_state *recovery;
240 struct ctdb_control_get_ifaces *ifaces;
241 uint32_t *force_rebalance_nodes;
242 struct ctdb_node_capabilities *caps;
245 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
246 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
248 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
251 ban a node for a period of time
253 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
255 int ret;
256 struct ctdb_context *ctdb = rec->ctdb;
257 struct ctdb_ban_time bantime;
259 if (!ctdb_validate_pnn(ctdb, pnn)) {
260 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
261 return;
264 DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
266 bantime.pnn = pnn;
267 bantime.time = ban_time;
269 ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
270 if (ret != 0) {
271 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
272 return;
277 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
281 remember the trouble maker
283 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
285 struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
286 struct ctdb_banning_state *ban_state;
288 if (culprit > ctdb->num_nodes) {
289 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
290 return;
293 /* If we are banned or stopped, do not set other nodes as culprits */
294 if (rec->node_flags & NODE_FLAGS_INACTIVE) {
295 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
296 return;
299 if (ctdb->nodes[culprit]->ban_state == NULL) {
300 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
301 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
305 ban_state = ctdb->nodes[culprit]->ban_state;
306 if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
307 /* this was the first time in a long while this node
308 misbehaved so we will forgive any old transgressions.
310 ban_state->count = 0;
313 ban_state->count += count;
314 ban_state->last_reported_time = timeval_current();
315 rec->last_culprit_node = culprit;
319 remember the trouble maker
321 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
323 ctdb_set_culprit_count(rec, culprit, 1);
327 /* this callback is called for every node that failed to execute the
328 recovered event
330 static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
332 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
334 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
336 ctdb_set_culprit(rec, node_pnn);
340 run the "recovered" eventscript on all nodes
342 static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
344 TALLOC_CTX *tmp_ctx;
345 uint32_t *nodes;
346 struct ctdb_context *ctdb = rec->ctdb;
348 tmp_ctx = talloc_new(ctdb);
349 CTDB_NO_MEMORY(ctdb, tmp_ctx);
351 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
352 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
353 nodes, 0,
354 CONTROL_TIMEOUT(), false, tdb_null,
355 NULL, recovered_fail_callback,
356 rec) != 0) {
357 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
359 talloc_free(tmp_ctx);
360 return -1;
363 talloc_free(tmp_ctx);
364 return 0;
367 /* this callback is called for every node that failed to execute the
368 start recovery event
370 static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
372 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
374 DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
376 ctdb_set_culprit(rec, node_pnn);
380 run the "startrecovery" eventscript on all nodes
382 static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
384 TALLOC_CTX *tmp_ctx;
385 uint32_t *nodes;
386 struct ctdb_context *ctdb = rec->ctdb;
388 tmp_ctx = talloc_new(ctdb);
389 CTDB_NO_MEMORY(ctdb, tmp_ctx);
391 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
392 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
393 nodes, 0,
394 CONTROL_TIMEOUT(), false, tdb_null,
395 NULL,
396 startrecovery_fail_callback,
397 rec) != 0) {
398 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
399 talloc_free(tmp_ctx);
400 return -1;
403 talloc_free(tmp_ctx);
404 return 0;
408 update the node capabilities for all connected nodes
410 static int update_capabilities(struct ctdb_recoverd *rec,
411 struct ctdb_node_map *nodemap)
413 uint32_t *capp;
414 TALLOC_CTX *tmp_ctx;
415 struct ctdb_node_capabilities *caps;
416 struct ctdb_context *ctdb = rec->ctdb;
418 tmp_ctx = talloc_new(rec);
419 CTDB_NO_MEMORY(ctdb, tmp_ctx);
421 caps = ctdb_get_capabilities(ctdb, tmp_ctx,
422 CONTROL_TIMEOUT(), nodemap);
424 if (caps == NULL) {
425 DEBUG(DEBUG_ERR,
426 (__location__ " Failed to get node capabilities\n"));
427 talloc_free(tmp_ctx);
428 return -1;
431 capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
432 if (capp == NULL) {
433 DEBUG(DEBUG_ERR,
434 (__location__
435 " Capabilities don't include current node.\n"));
436 talloc_free(tmp_ctx);
437 return -1;
439 ctdb->capabilities = *capp;
441 TALLOC_FREE(rec->caps);
442 rec->caps = talloc_steal(rec, caps);
444 talloc_free(tmp_ctx);
445 return 0;
448 static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
450 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
452 DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
453 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
456 static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
458 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
460 DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
461 ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
465 change recovery mode on all nodes
467 static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
469 TDB_DATA data;
470 uint32_t *nodes;
471 TALLOC_CTX *tmp_ctx;
473 tmp_ctx = talloc_new(ctdb);
474 CTDB_NO_MEMORY(ctdb, tmp_ctx);
476 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
478 data.dsize = sizeof(uint32_t);
479 data.dptr = (unsigned char *)&rec_mode;
481 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
482 nodes, 0,
483 CONTROL_TIMEOUT(),
484 false, data,
485 NULL, NULL,
486 NULL) != 0) {
487 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
488 talloc_free(tmp_ctx);
489 return -1;
492 /* freeze all nodes */
493 if (rec_mode == CTDB_RECOVERY_ACTIVE) {
494 int i;
496 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
497 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
498 nodes, i,
499 CONTROL_TIMEOUT(),
500 false, tdb_null,
501 NULL,
502 set_recmode_fail_callback,
503 rec) != 0) {
504 DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
505 talloc_free(tmp_ctx);
506 return -1;
511 talloc_free(tmp_ctx);
512 return 0;
516 change recovery master on all nodes
518 static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
520 TDB_DATA data;
521 TALLOC_CTX *tmp_ctx;
522 uint32_t *nodes;
524 tmp_ctx = talloc_new(ctdb);
525 CTDB_NO_MEMORY(ctdb, tmp_ctx);
527 data.dsize = sizeof(uint32_t);
528 data.dptr = (unsigned char *)&pnn;
530 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
531 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
532 nodes, 0,
533 CONTROL_TIMEOUT(), false, data,
534 NULL, NULL,
535 NULL) != 0) {
536 DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
537 talloc_free(tmp_ctx);
538 return -1;
541 talloc_free(tmp_ctx);
542 return 0;
545 /* update all remote nodes to use the same db priority that we have
546 this can fail if the remote node has not yet been upgraded to
547 support this function, so we always return success and never fail
548 a recovery if this call fails.
550 static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
551 struct ctdb_node_map *nodemap,
552 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
554 int db;
556 /* step through all local databases */
557 for (db=0; db<dbmap->num;db++) {
558 struct ctdb_db_priority db_prio;
559 int ret;
561 db_prio.db_id = dbmap->dbs[db].dbid;
562 ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
563 if (ret != 0) {
564 DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
565 continue;
568 DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority));
570 ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
571 CTDB_CURRENT_NODE, &db_prio);
572 if (ret != 0) {
573 DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
574 db_prio.db_id));
578 return 0;
582 ensure all other nodes have attached to any databases that we have
584 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
585 uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
587 int i, j, db, ret;
588 struct ctdb_dbid_map *remote_dbmap;
590 /* verify that all other nodes have all our databases */
591 for (j=0; j<nodemap->num; j++) {
592 /* we don't need to check ourselves */
593 if (nodemap->nodes[j].pnn == pnn) {
594 continue;
596 /* dont check nodes that are unavailable */
597 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
598 continue;
601 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
602 mem_ctx, &remote_dbmap);
603 if (ret != 0) {
604 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
605 return -1;
608 /* step through all local databases */
609 for (db=0; db<dbmap->num;db++) {
610 const char *name;
613 for (i=0;i<remote_dbmap->num;i++) {
614 if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
615 break;
618 /* the remote node already has this database */
619 if (i!=remote_dbmap->num) {
620 continue;
622 /* ok so we need to create this database */
623 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
624 dbmap->dbs[db].dbid, mem_ctx,
625 &name);
626 if (ret != 0) {
627 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
628 return -1;
630 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
631 nodemap->nodes[j].pnn,
632 mem_ctx, name,
633 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
634 if (ret != 0) {
635 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
636 return -1;
641 return 0;
646 ensure we are attached to any databases that anyone else is attached to
648 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
649 uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
651 int i, j, db, ret;
652 struct ctdb_dbid_map *remote_dbmap;
654 /* verify that we have all databases any other node has */
655 for (j=0; j<nodemap->num; j++) {
656 /* we don't need to check ourselves */
657 if (nodemap->nodes[j].pnn == pnn) {
658 continue;
660 /* dont check nodes that are unavailable */
661 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
662 continue;
665 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
666 mem_ctx, &remote_dbmap);
667 if (ret != 0) {
668 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
669 return -1;
672 /* step through all databases on the remote node */
673 for (db=0; db<remote_dbmap->num;db++) {
674 const char *name;
676 for (i=0;i<(*dbmap)->num;i++) {
677 if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
678 break;
681 /* we already have this db locally */
682 if (i!=(*dbmap)->num) {
683 continue;
685 /* ok so we need to create this database and
686 rebuild dbmap
688 ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
689 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
690 if (ret != 0) {
691 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
692 nodemap->nodes[j].pnn));
693 return -1;
695 ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
696 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
697 if (ret != 0) {
698 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
699 return -1;
701 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
702 if (ret != 0) {
703 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
704 return -1;
709 return 0;
714 pull the remote database contents from one node into the recdb
716 static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
717 struct tdb_wrap *recdb, uint32_t dbid)
719 int ret;
720 TDB_DATA outdata;
721 struct ctdb_marshall_buffer *reply;
722 struct ctdb_rec_data *recdata;
723 int i;
724 TALLOC_CTX *tmp_ctx = talloc_new(recdb);
726 ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
727 CONTROL_TIMEOUT(), &outdata);
728 if (ret != 0) {
729 DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
730 talloc_free(tmp_ctx);
731 return -1;
734 reply = (struct ctdb_marshall_buffer *)outdata.dptr;
736 if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
737 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
738 talloc_free(tmp_ctx);
739 return -1;
742 recdata = (struct ctdb_rec_data *)&reply->data[0];
744 for (i=0;
745 i<reply->count;
746 recdata = (struct ctdb_rec_data *)(recdata->length + (uint8_t *)recdata), i++) {
747 TDB_DATA key, data;
748 struct ctdb_ltdb_header *hdr;
749 TDB_DATA existing;
751 key.dptr = &recdata->data[0];
752 key.dsize = recdata->keylen;
753 data.dptr = &recdata->data[key.dsize];
754 data.dsize = recdata->datalen;
756 hdr = (struct ctdb_ltdb_header *)data.dptr;
758 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
759 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
760 talloc_free(tmp_ctx);
761 return -1;
764 /* fetch the existing record, if any */
765 existing = tdb_fetch(recdb->tdb, key);
767 if (existing.dptr != NULL) {
768 struct ctdb_ltdb_header header;
769 if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
770 DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
771 (unsigned)existing.dsize, srcnode));
772 free(existing.dptr);
773 talloc_free(tmp_ctx);
774 return -1;
776 header = *(struct ctdb_ltdb_header *)existing.dptr;
777 free(existing.dptr);
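/* Only overwrite the copy already stored in the recdb when the pulled
   record has a higher RSN, or the same RSN while the stored copy is not
   dmastered by this node; otherwise skip the pulled record. */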
778 if (!(header.rsn < hdr->rsn ||
779 (header.dmaster != ctdb_get_pnn(ctdb) &&
780 header.rsn == hdr->rsn))) {
781 continue;
785 if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
786 DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
787 talloc_free(tmp_ctx);
788 return -1;
792 talloc_free(tmp_ctx);
794 return 0;
798 struct pull_seqnum_cbdata {
799 int failed;
800 uint32_t pnn;
801 uint64_t seqnum;
804 static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
806 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
807 uint64_t seqnum;
809 if (cb_data->failed != 0) {
810 DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
811 return;
814 if (res != 0) {
815 DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
816 cb_data->failed = 1;
817 return;
820 if (outdata.dsize != sizeof(uint64_t)) {
821 DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
822 cb_data->failed = -1;
823 return;
826 seqnum = *((uint64_t *)outdata.dptr);
828 if (seqnum > cb_data->seqnum ||
829 (cb_data->pnn == -1 && seqnum == 0)) {
830 cb_data->seqnum = seqnum;
831 cb_data->pnn = node_pnn;
835 static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
837 struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
839 DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
840 cb_data->failed = 1;
843 static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
844 struct ctdb_recoverd *rec,
845 struct ctdb_node_map *nodemap,
846 struct tdb_wrap *recdb, uint32_t dbid)
848 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
849 uint32_t *nodes;
850 TDB_DATA data;
851 uint32_t outdata[2];
852 struct pull_seqnum_cbdata *cb_data;
854 DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
856 outdata[0] = dbid;
857 outdata[1] = 0;
859 data.dsize = sizeof(outdata);
860 data.dptr = (uint8_t *)&outdata[0];
862 cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
863 if (cb_data == NULL) {
864 DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
865 talloc_free(tmp_ctx);
866 return -1;
869 cb_data->failed = 0;
870 cb_data->pnn = -1;
871 cb_data->seqnum = 0;
873 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
874 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
875 nodes, 0,
876 CONTROL_TIMEOUT(), false, data,
877 pull_seqnum_cb,
878 pull_seqnum_fail_cb,
879 cb_data) != 0) {
880 DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
882 talloc_free(tmp_ctx);
883 return -1;
886 if (cb_data->failed != 0) {
887 DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
888 talloc_free(tmp_ctx);
889 return -1;
892 if (cb_data->pnn == -1) {
893 DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
894 talloc_free(tmp_ctx);
895 return -1;
898 DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
900 if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
901 DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
902 talloc_free(tmp_ctx);
903 return -1;
906 talloc_free(tmp_ctx);
907 return 0;
912 pull all the remote database contents into the recdb
914 static int pull_remote_database(struct ctdb_context *ctdb,
915 struct ctdb_recoverd *rec,
916 struct ctdb_node_map *nodemap,
917 struct tdb_wrap *recdb, uint32_t dbid,
918 bool persistent)
920 int j;
922 if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
923 int ret;
924 ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
925 if (ret == 0) {
926 return 0;
930 /* pull all records from all other nodes across onto this node
931 (this merges based on rsn)
933 for (j=0; j<nodemap->num; j++) {
934 /* dont merge from nodes that are unavailable */
935 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
936 continue;
938 if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
939 DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
940 nodemap->nodes[j].pnn));
941 ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
942 return -1;
946 return 0;
951 update flags on all active nodes
953 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
955 int ret;
957 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
958 if (ret != 0) {
959 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
960 return -1;
963 return 0;
967 ensure all nodes have the same vnnmap we do
969 static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
970 uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
972 int j, ret;
974 /* push the new vnn map out to all the nodes */
975 for (j=0; j<nodemap->num; j++) {
976 /* dont push to nodes that are unavailable */
977 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
978 continue;
981 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
982 if (ret != 0) {
983 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
984 return -1;
988 return 0;
992 struct vacuum_info {
993 struct vacuum_info *next, *prev;
994 struct ctdb_recoverd *rec;
995 uint32_t srcnode;
996 struct ctdb_db_context *ctdb_db;
997 struct ctdb_marshall_buffer *recs;
998 struct ctdb_rec_data *r;
1003 called when a vacuum fetch has completed - just free the call state
1005 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
1007 talloc_free(state);
1012 process the next element from the vacuum list
1014 static void vacuum_fetch_next(struct vacuum_info *v)
1016 struct ctdb_call call;
1017 struct ctdb_rec_data *r;
1019 while (v->recs->count) {
1020 struct ctdb_client_call_state *state;
1021 TDB_DATA data;
1022 struct ctdb_ltdb_header *hdr;
1024 ZERO_STRUCT(call);
1025 call.call_id = CTDB_NULL_FUNC;
1026 call.flags = CTDB_IMMEDIATE_MIGRATION;
1027 call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
1029 r = v->r;
1030 v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
1031 v->recs->count--;
1033 call.key.dptr = &r->data[0];
1034 call.key.dsize = r->keylen;
1036 /* ensure we don't block this daemon - just skip a record if we can't get
1037 the chainlock */
1038 if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
1039 continue;
1042 data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
1043 if (data.dptr == NULL) {
1044 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
1045 continue;
1048 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1049 free(data.dptr);
1050 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
1051 continue;
1054 hdr = (struct ctdb_ltdb_header *)data.dptr;
1055 if (hdr->dmaster == v->rec->ctdb->pnn) {
1056 /* its already local */
1057 free(data.dptr);
1058 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
1059 continue;
1062 free(data.dptr);
1064 state = ctdb_call_send(v->ctdb_db, &call);
1065 tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
1066 if (state == NULL) {
1067 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
1068 talloc_free(v);
1069 return;
1071 state->async.fn = vacuum_fetch_callback;
1072 state->async.private_data = NULL;
1075 talloc_free(v);
1080 destroy a vacuum info structure
1082 static int vacuum_info_destructor(struct vacuum_info *v)
1084 DLIST_REMOVE(v->rec->vacuum_info, v);
1085 return 0;
1090 handler for vacuum fetch
1092 static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
1093 TDB_DATA data, void *private_data)
1095 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
1096 struct ctdb_marshall_buffer *recs;
1097 int ret, i;
1098 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1099 const char *name;
1100 struct ctdb_dbid_map *dbmap=NULL;
1101 bool persistent = false;
1102 struct ctdb_db_context *ctdb_db;
1103 struct ctdb_rec_data *r;
1104 uint32_t srcnode;
1105 struct vacuum_info *v;
1107 recs = (struct ctdb_marshall_buffer *)data.dptr;
1108 r = (struct ctdb_rec_data *)&recs->data[0];
1110 if (recs->count == 0) {
1111 talloc_free(tmp_ctx);
1112 return;
1115 srcnode = r->reqid;
1117 for (v=rec->vacuum_info;v;v=v->next) {
1118 if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
1119 /* we're already working on records from this node */
1120 talloc_free(tmp_ctx);
1121 return;
1125 /* work out if the database is persistent */
1126 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
1127 if (ret != 0) {
1128 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
1129 talloc_free(tmp_ctx);
1130 return;
1133 for (i=0;i<dbmap->num;i++) {
1134 if (dbmap->dbs[i].dbid == recs->db_id) {
1135 persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
1136 break;
1139 if (i == dbmap->num) {
1140 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
1141 talloc_free(tmp_ctx);
1142 return;
1145 /* find the name of this database */
1146 if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
1147 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
1148 talloc_free(tmp_ctx);
1149 return;
1152 /* attach to it */
1153 ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
1154 if (ctdb_db == NULL) {
1155 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
1156 talloc_free(tmp_ctx);
1157 return;
1160 v = talloc_zero(rec, struct vacuum_info);
1161 if (v == NULL) {
1162 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1163 talloc_free(tmp_ctx);
1164 return;
1167 v->rec = rec;
1168 v->srcnode = srcnode;
1169 v->ctdb_db = ctdb_db;
1170 v->recs = talloc_memdup(v, recs, data.dsize);
1171 if (v->recs == NULL) {
1172 DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
1173 talloc_free(v);
1174 talloc_free(tmp_ctx);
1175 return;
1177 v->r = (struct ctdb_rec_data *)&v->recs->data[0];
1179 DLIST_ADD(rec->vacuum_info, v);
1181 talloc_set_destructor(v, vacuum_info_destructor);
1183 vacuum_fetch_next(v);
1184 talloc_free(tmp_ctx);
1189 * handler for database detach
1191 static void detach_database_handler(struct ctdb_context *ctdb, uint64_t srvid,
1192 TDB_DATA data, void *private_data)
1194 struct ctdb_recoverd *rec = talloc_get_type(private_data,
1195 struct ctdb_recoverd);
1196 uint32_t db_id;
1197 struct vacuum_info *v, *vnext;
1198 struct ctdb_db_context *ctdb_db;
1200 if (data.dsize != sizeof(db_id)) {
1201 return;
1203 db_id = *(uint32_t *)data.dptr;
1205 ctdb_db = find_ctdb_db(ctdb, db_id);
1206 if (ctdb_db == NULL) {
1207 /* database is not attached */
1208 return;
1211 /* Stop any active vacuum fetch */
1212 v = rec->vacuum_info;
1213 while (v != NULL) {
1214 vnext = v->next;
1216 if (v->ctdb_db->db_id == db_id) {
1217 talloc_free(v);
1219 v = vnext;
1222 DLIST_REMOVE(ctdb->db_list, ctdb_db);
1224 DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
1225 ctdb_db->db_name));
1226 talloc_free(ctdb_db);
1230 called when ctdb_wait_timeout should finish
1232 static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te,
1233 struct timeval yt, void *p)
1235 uint32_t *timed_out = (uint32_t *)p;
1236 (*timed_out) = 1;
1240 wait for a given number of seconds
1242 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
1244 uint32_t timed_out = 0;
1245 time_t usecs = (secs - (time_t)secs) * 1000000;
1246 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs), ctdb_wait_handler, &timed_out);
1247 while (!timed_out) {
1248 event_loop_once(ctdb->ev);
1253 called when an election times out (ends)
1255 static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te,
1256 struct timeval t, void *p)
1258 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1259 rec->election_timeout = NULL;
1260 fast_start = false;
1262 DEBUG(DEBUG_WARNING,("Election period ended\n"));
1267 wait for an election to finish. It finishes election_timeout seconds after
1268 the last election packet is received
1270 static void ctdb_wait_election(struct ctdb_recoverd *rec)
1272 struct ctdb_context *ctdb = rec->ctdb;
1273 while (rec->election_timeout) {
1274 event_loop_once(ctdb->ev);
1279 Update our local flags from all remote connected nodes.
1280 This is only run when we are, or believe we are, the recovery master
1282 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
1284 int j;
1285 struct ctdb_context *ctdb = rec->ctdb;
1286 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
1288 /* get the nodemap for all active remote nodes and verify
1289 they are the same as for this node
1291 for (j=0; j<nodemap->num; j++) {
1292 struct ctdb_node_map *remote_nodemap=NULL;
1293 int ret;
1295 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
1296 continue;
1298 if (nodemap->nodes[j].pnn == ctdb->pnn) {
1299 continue;
1302 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
1303 mem_ctx, &remote_nodemap);
1304 if (ret != 0) {
1305 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
1306 nodemap->nodes[j].pnn));
1307 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
1308 talloc_free(mem_ctx);
1309 return MONITOR_FAILED;
1311 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
1312 /* We should tell our daemon about this so it
1313 updates its flags or else we will log the same
1314 message again in the next iteration of recovery.
1315 Since we are the recovery master we can just as
1316 well update the flags on all nodes.
1318 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
1319 if (ret != 0) {
1320 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
1321 return -1;
1324 /* Update our local copy of the flags in the recovery
1325 daemon.
1327 DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
1328 nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
1329 nodemap->nodes[j].flags));
1330 nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
1332 talloc_free(remote_nodemap);
1334 talloc_free(mem_ctx);
1335 return MONITOR_OK;
1339 /* Create a new random generation id.
1340 The generation id cannot be the INVALID_GENERATION id
1342 static uint32_t new_generation(void)
1344 uint32_t generation;
1346 while (1) {
1347 generation = random();
1349 if (generation != INVALID_GENERATION) {
1350 break;
1354 return generation;
1359 create a temporary working database
1361 static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
1363 char *name;
1364 struct tdb_wrap *recdb;
1365 unsigned tdb_flags;
1367 /* open up the temporary recovery database */
1368 name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
1369 ctdb->db_directory_state,
1370 ctdb->pnn);
1371 if (name == NULL) {
1372 return NULL;
1374 unlink(name);
1376 tdb_flags = TDB_NOLOCK;
1377 if (ctdb->valgrinding) {
1378 tdb_flags |= TDB_NOMMAP;
1380 tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
1382 recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
1383 tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
1384 if (recdb == NULL) {
1385 DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
1388 talloc_free(name);
1390 return recdb;
1395 a traverse function for pulling all relevant records from recdb
1397 struct recdb_data {
1398 struct ctdb_context *ctdb;
1399 struct ctdb_marshall_buffer *recdata;
1400 uint32_t len;
1401 uint32_t allocated_len;
1402 bool failed;
1403 bool persistent;
1406 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
1408 struct recdb_data *params = (struct recdb_data *)p;
1409 struct ctdb_rec_data *recdata;
1410 struct ctdb_ltdb_header *hdr;
1413 * skip empty records - but NOT for persistent databases:
1415 * The record-by-record mode of recovery deletes empty records.
1416 * For persistent databases, this can lead to data corruption
1417 * by deleting records that should be there:
1419 * - Assume the cluster has been running for a while.
1421 * - A record R in a persistent database has been created and
1422 * deleted a couple of times, the last operation being deletion,
1423 * leaving an empty record with a high RSN, say 10.
1425 * - Now a node N is turned off.
1427 * - This leaves the local database copy of D on N with the empty
1428 * copy of R and RSN 10. On all other nodes, the recovery has deleted
1429 * the copy of record R.
1431 * - Now the record is created again while node N is turned off.
1432 * This creates R with RSN = 1 on all nodes except for N.
1434 * - Now node N is turned on again. The following recovery will choose
1435 * the older empty copy of R due to RSN 10 > RSN 1.
1437 * ==> Hence the record is gone after the recovery.
1439 * On databases like Samba's registry, this can damage the higher-level
1440 * data structures built from the various tdb-level records.
1442 if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
1443 return 0;
1446 /* update the dmaster field to point to us */
1447 hdr = (struct ctdb_ltdb_header *)data.dptr;
1448 if (!params->persistent) {
1449 hdr->dmaster = params->ctdb->pnn;
1450 hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
1453 /* add the record to the blob ready to send to the nodes */
1454 recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
1455 if (recdata == NULL) {
1456 params->failed = true;
1457 return -1;
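/* Grow the marshall buffer to fit the new record plus a preallocation
   margin (pulldb_preallocation_size), so it is not reallocated for every
   record. */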
1459 if (params->len + recdata->length >= params->allocated_len) {
1460 params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
1461 params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
1463 if (params->recdata == NULL) {
1464 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
1465 recdata->length + params->len));
1466 params->failed = true;
1467 return -1;
1469 params->recdata->count++;
1470 memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
1471 params->len += recdata->length;
1472 talloc_free(recdata);
1474 return 0;
1478 push the recdb database out to all nodes
1480 static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
1481 bool persistent,
1482 struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
1484 struct recdb_data params;
1485 struct ctdb_marshall_buffer *recdata;
1486 TDB_DATA outdata;
1487 TALLOC_CTX *tmp_ctx;
1488 uint32_t *nodes;
1490 tmp_ctx = talloc_new(ctdb);
1491 CTDB_NO_MEMORY(ctdb, tmp_ctx);
1493 recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
1494 CTDB_NO_MEMORY(ctdb, recdata);
1496 recdata->db_id = dbid;
1498 params.ctdb = ctdb;
1499 params.recdata = recdata;
1500 params.len = offsetof(struct ctdb_marshall_buffer, data);
1501 params.allocated_len = params.len;
1502 params.failed = false;
1503 params.persistent = persistent;
1505 if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
1506 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1507 talloc_free(params.recdata);
1508 talloc_free(tmp_ctx);
1509 return -1;
1512 if (params.failed) {
1513 DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
1514 talloc_free(params.recdata);
1515 talloc_free(tmp_ctx);
1516 return -1;
1519 recdata = params.recdata;
1521 outdata.dptr = (void *)recdata;
1522 outdata.dsize = params.len;
1524 nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
1525 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
1526 nodes, 0,
1527 CONTROL_TIMEOUT(), false, outdata,
1528 NULL, NULL,
1529 NULL) != 0) {
1530 DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
1531 talloc_free(recdata);
1532 talloc_free(tmp_ctx);
1533 return -1;
1536 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
1537 dbid, recdata->count));
1539 talloc_free(recdata);
1540 talloc_free(tmp_ctx);
1542 return 0;
1547 go through a full recovery on one database
1549 static int recover_database(struct ctdb_recoverd *rec,
1550 TALLOC_CTX *mem_ctx,
1551 uint32_t dbid,
1552 bool persistent,
1553 uint32_t pnn,
1554 struct ctdb_node_map *nodemap,
1555 uint32_t transaction_id)
1557 struct tdb_wrap *recdb;
1558 int ret;
1559 struct ctdb_context *ctdb = rec->ctdb;
1560 TDB_DATA data;
1561 struct ctdb_control_wipe_database w;
1562 uint32_t *nodes;
1564 recdb = create_recdb(ctdb, mem_ctx);
1565 if (recdb == NULL) {
1566 return -1;
1569 /* pull all remote databases onto the recdb */
1570 ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
1571 if (ret != 0) {
1572 DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
1573 return -1;
1576 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
1578 /* wipe all the remote databases. This is safe as we are in a transaction */
1579 w.db_id = dbid;
1580 w.transaction_id = transaction_id;
1582 data.dptr = (void *)&w;
1583 data.dsize = sizeof(w);
1585 nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
1586 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
1587 nodes, 0,
1588 CONTROL_TIMEOUT(), false, data,
1589 NULL, NULL,
1590 NULL) != 0) {
1591 DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
1592 talloc_free(recdb);
1593 return -1;
1596 /* push out the correct database. This sets the dmaster and skips
1597 the empty records */
1598 ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
1599 if (ret != 0) {
1600 talloc_free(recdb);
1601 return -1;
1604 /* all done with this database */
1605 talloc_free(recdb);
1607 return 0;
1610 static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1611 struct ctdb_recoverd *rec,
1612 struct ctdb_node_map *nodemap,
1613 uint32_t *culprit)
1615 int j;
1616 int ret;
1618 if (ctdb->num_nodes != nodemap->num) {
1619 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
1620 ctdb->num_nodes, nodemap->num));
1621 if (culprit) {
1622 *culprit = ctdb->pnn;
1624 return -1;
1627 for (j=0; j<nodemap->num; j++) {
1628 /* For readability */
1629 struct ctdb_node *node = ctdb->nodes[j];
1631 /* release any existing data */
1632 if (node->known_public_ips) {
1633 talloc_free(node->known_public_ips);
1634 node->known_public_ips = NULL;
1636 if (node->available_public_ips) {
1637 talloc_free(node->available_public_ips);
1638 node->available_public_ips = NULL;
1641 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1642 continue;
1645 /* Retrieve the list of known public IPs from the node */
1646 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1647 CONTROL_TIMEOUT(),
1648 node->pnn,
1649 ctdb->nodes,
1651 &node->known_public_ips);
1652 if (ret != 0) {
1653 DEBUG(DEBUG_ERR,
1654 ("Failed to read known public IPs from node: %u\n",
1655 node->pnn));
1656 if (culprit) {
1657 *culprit = node->pnn;
1659 return -1;
1662 if (ctdb->do_checkpublicip &&
1663 !ctdb_op_is_disabled(rec->takeover_run) &&
1664 verify_remote_ip_allocation(ctdb,
1665 node->known_public_ips,
1666 node->pnn)) {
1667 DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
1668 rec->need_takeover_run = true;
1671 /* Retrieve the list of available public IPs from the node */
1672 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1673 CONTROL_TIMEOUT(),
1674 node->pnn,
1675 ctdb->nodes,
1676 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1677 &node->available_public_ips);
1678 if (ret != 0) {
1679 DEBUG(DEBUG_ERR,
1680 ("Failed to read available public IPs from node: %u\n",
1681 node->pnn));
1682 if (culprit) {
1683 *culprit = node->pnn;
1685 return -1;
1689 return 0;
1692 /* when we start a recovery, make sure all nodes use the same reclock file
1693 setting
1695 static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
1697 struct ctdb_context *ctdb = rec->ctdb;
1698 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
1699 TDB_DATA data;
1700 uint32_t *nodes;
1702 if (ctdb->recovery_lock_file == NULL) {
1703 data.dptr = NULL;
1704 data.dsize = 0;
1705 } else {
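/* Send the path including its terminating NUL so the receiving node
   gets a complete C string. */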
1706 data.dsize = strlen(ctdb->recovery_lock_file) + 1;
1707 data.dptr = (uint8_t *)ctdb->recovery_lock_file;
1710 nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
1711 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
1712 nodes, 0,
1713 CONTROL_TIMEOUT(),
1714 false, data,
1715 NULL, NULL,
1716 rec) != 0) {
1717 DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
1718 talloc_free(tmp_ctx);
1719 return -1;
1722 talloc_free(tmp_ctx);
1723 return 0;
1728 * this callback is called for every node that failed to execute ctdb_takeover_run()
1729 * and sets a flag to re-run the takeover run.
1731 static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
1733 DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
1735 if (callback_data != NULL) {
1736 struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
1738 DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
1740 ctdb_set_culprit(rec, node_pnn);
1745 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1747 struct ctdb_context *ctdb = rec->ctdb;
1748 int i;
1749 struct ctdb_banning_state *ban_state;
1751 *self_ban = false;
1752 for (i=0; i<ctdb->num_nodes; i++) {
1753 if (ctdb->nodes[i]->ban_state == NULL) {
1754 continue;
1756 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
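/* A node is banned only once it has accumulated banning credits equal to
   twice the number of nodes; counts below that threshold are tolerated. */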
1757 if (ban_state->count < 2*ctdb->num_nodes) {
1758 continue;
1761 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1762 ctdb->nodes[i]->pnn, ban_state->count,
1763 ctdb->tunable.recovery_ban_period));
1764 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1765 ban_state->count = 0;
1767 /* Banning ourself? */
1768 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
1769 *self_ban = true;
1774 static bool do_takeover_run(struct ctdb_recoverd *rec,
1775 struct ctdb_node_map *nodemap,
1776 bool banning_credits_on_fail)
1778 uint32_t *nodes = NULL;
1779 struct srvid_request_data dtr;
1780 TDB_DATA data;
1781 int i;
1782 uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1783 int ret;
1784 bool ok;
1786 DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1788 if (ctdb_op_is_in_progress(rec->takeover_run)) {
1789 DEBUG(DEBUG_ERR, (__location__
1790 " takeover run already in progress \n"));
1791 ok = false;
1792 goto done;
1795 if (!ctdb_op_begin(rec->takeover_run)) {
1796 ok = false;
1797 goto done;
1800 /* Disable IP checks (takeover runs, really) on other nodes
1801 * while doing this takeover run. This will stop those other
1802 nodes from triggering takeover runs when they think they should
1803 * be hosting an IP but it isn't yet on an interface. Don't
1804 * wait for replies since a failure here might cause some
1805 * noise in the logs but will not actually cause a problem.
1807 dtr.srvid = 0; /* No reply */
1808 dtr.pnn = -1;
1810 data.dptr = (uint8_t*)&dtr;
1811 data.dsize = sizeof(dtr);
1813 nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1815 /* Disable for 60 seconds. This can be a tunable later if
1816 * necessary.
1818 dtr.data = 60;
1819 for (i = 0; i < talloc_array_length(nodes); i++) {
1820 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1821 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1822 data) != 0) {
1823 DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1827 ret = ctdb_takeover_run(rec->ctdb, nodemap,
1828 rec->force_rebalance_nodes,
1829 takeover_fail_callback,
1830 banning_credits_on_fail ? rec : NULL);
1832 /* Reenable takeover runs and IP checks on other nodes */
1833 dtr.data = 0;
1834 for (i = 0; i < talloc_array_length(nodes); i++) {
1835 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1836 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1837 data) != 0) {
1838 DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
1842 if (ret != 0) {
1843 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1844 ok = false;
1845 goto done;
1848 ok = true;
1849 /* Takeover run was successful so clear force rebalance targets */
1850 if (rebalance_nodes == rec->force_rebalance_nodes) {
1851 TALLOC_FREE(rec->force_rebalance_nodes);
1852 } else {
1853 DEBUG(DEBUG_WARNING,
1854 ("Rebalance target nodes changed during takeover run - not clearing\n"));
1856 done:
1857 rec->need_takeover_run = !ok;
1858 talloc_free(nodes);
1859 ctdb_op_end(rec->takeover_run);
1861 DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1862 return ok;
1867 we are the recmaster, and recovery is needed - start a recovery run
1869 static int do_recovery(struct ctdb_recoverd *rec,
1870 TALLOC_CTX *mem_ctx, uint32_t pnn,
1871 struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
1873 struct ctdb_context *ctdb = rec->ctdb;
1874 int i, j, ret;
1875 uint32_t generation;
1876 struct ctdb_dbid_map *dbmap;
1877 TDB_DATA data;
1878 uint32_t *nodes;
1879 struct timeval start_time;
1880 uint32_t culprit = (uint32_t)-1;
1881 bool self_ban;
1883 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1885 /* if recovery fails, force it again */
1886 rec->need_recovery = true;
1888 if (!ctdb_op_begin(rec->recovery)) {
1889 return -1;
1892 if (rec->election_timeout) {
1893 /* an election is in progress */
1894 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1895 goto fail;
1898 ban_misbehaving_nodes(rec, &self_ban);
1899 if (self_ban) {
1900 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1901 goto fail;
1904 if (ctdb->recovery_lock_file != NULL) {
1905 if (ctdb_recovery_have_lock(ctdb)) {
1906 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
1907 } else {
1908 start_time = timeval_current();
1909 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
1910 ctdb->recovery_lock_file));
1911 if (!ctdb_recovery_lock(ctdb)) {
1912 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
1913 /* If ctdb is attempting its first recovery, it's
1914 * possible that the current node does not yet
1915 * know who the recmaster is.
1917 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
1918 " - retrying recovery\n"));
1919 goto fail;
1922 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
1923 "and ban ourself for %u seconds\n",
1924 ctdb->tunable.recovery_ban_period));
1925 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
1926 goto fail;
1928 ctdb_ctrl_report_recd_lock_latency(ctdb,
1929 CONTROL_TIMEOUT(),
1930 timeval_elapsed(&start_time));
1931 DEBUG(DEBUG_NOTICE,
1932 ("Recovery lock taken successfully by recovery daemon\n"));
1936 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1938 /* get a list of all databases */
1939 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1940 if (ret != 0) {
1941 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1942 goto fail;
1945 /* we do the db creation before we set the recovery mode, so the freeze happens
1946 on all databases we will be dealing with. */
1948 /* verify that we have all the databases any other node has */
1949 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1950 if (ret != 0) {
1951 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1952 goto fail;
1955 /* verify that all other nodes have all our databases */
1956 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1957 if (ret != 0) {
1958 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1959 goto fail;
1961 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1963 /* update the database priority for all remote databases */
1964 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
1965 if (ret != 0) {
1966 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
1968 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
1971 /* update all other nodes to use the same setting for reclock files
1972 as the local recovery master.
1974 sync_recovery_lock_file_across_cluster(rec);
1976 /* set recovery mode to active on all nodes */
1977 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
1978 if (ret != 0) {
1979 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1980 goto fail;
1983 /* execute the "startrecovery" event script on all nodes */
1984 ret = run_startrecovery_eventscript(rec, nodemap);
1985 if (ret!=0) {
1986 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1987 goto fail;
1991 update all nodes to have the same flags that we have
1993 for (i=0;i<nodemap->num;i++) {
1994 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1995 continue;
1998 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1999 if (ret != 0) {
2000 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2001 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
2002 } else {
2003 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
2004 goto fail;
2009 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
2011 /* pick a new generation number */
2012 generation = new_generation();
2014 /* change the vnnmap on this node to use the new generation
2015 number but not on any other nodes.
2016 this guarantees that if we abort the recovery prematurely
2017 for some reason (a node stops responding?)
2018 that we can just return immediately and we will reenter
2019 recovery shortly again.
2020 I.e. we deliberately leave the cluster with an inconsistent
2021 generation id to allow us to abort recovery at any stage and
2022 just restart it from scratch.
2024 vnnmap->generation = generation;
2025 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
2026 if (ret != 0) {
2027 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
2028 goto fail;
2031 data.dptr = (void *)&generation;
2032 data.dsize = sizeof(uint32_t);
2034 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2035 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
2036 nodes, 0,
2037 CONTROL_TIMEOUT(), false, data,
2038 NULL,
2039 transaction_start_fail_callback,
2040 rec) != 0) {
2041 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
2042 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
2043 nodes, 0,
2044 CONTROL_TIMEOUT(), false, tdb_null,
2045 NULL,
2046 NULL,
2047 NULL) != 0) {
2048 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
2050 goto fail;
2053 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
2055 for (i=0;i<dbmap->num;i++) {
2056 ret = recover_database(rec, mem_ctx,
2057 dbmap->dbs[i].dbid,
2058 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
2059 pnn, nodemap, generation);
2060 if (ret != 0) {
2061 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
2062 goto fail;
2066 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
2068 /* commit all the changes */
2069 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
2070 nodes, 0,
2071 CONTROL_TIMEOUT(), false, data,
2072 NULL, NULL,
2073 NULL) != 0) {
2074 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
2075 goto fail;
2078 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
2081 /* update the capabilities for all nodes */
2082 ret = update_capabilities(rec, nodemap);
2083 if (ret!=0) {
2084 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2085 goto fail;
2088 /* build a new vnn map with all the currently active and
2089 unbanned nodes */
2090 generation = new_generation();
2091 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
2092 CTDB_NO_MEMORY(ctdb, vnnmap);
2093 vnnmap->generation = generation;
2094 vnnmap->size = 0;
2095 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
2096 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2097 for (i=j=0;i<nodemap->num;i++) {
2098 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2099 continue;
2101 if (!ctdb_node_has_capabilities(rec->caps,
2102 ctdb->nodes[i]->pnn,
2103 CTDB_CAP_LMASTER)) {
2104 /* this node can not be an lmaster */
2105 DEBUG(DEBUG_DEBUG, ("Node %d can't be an LMASTER, skipping it\n", i));
2106 continue;
2109 vnnmap->size++;
2110 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2111 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2112 vnnmap->map[j++] = nodemap->nodes[i].pnn;
2115 if (vnnmap->size == 0) {
2116 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
2117 vnnmap->size++;
2118 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
2119 CTDB_NO_MEMORY(ctdb, vnnmap->map);
2120 vnnmap->map[0] = pnn;
2123 /* update to the new vnnmap on all nodes */
2124 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
2125 if (ret != 0) {
2126 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
2127 goto fail;
2130 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
2132 /* update recmaster to point to us for all nodes */
2133 ret = set_recovery_master(ctdb, nodemap, pnn);
2134 if (ret!=0) {
2135 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
2136 goto fail;
2139 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
2141 /* disable recovery mode */
2142 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
2143 if (ret != 0) {
2144 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
2145 goto fail;
2148 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
2150 /* Fetch known/available public IPs from each active node */
2151 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
2152 if (ret != 0) {
2153 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2154 culprit));
2155 rec->need_takeover_run = true;
2156 goto fail;
2159 do_takeover_run(rec, nodemap, false);
2161 /* execute the "recovered" event script on all nodes */
2162 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
2163 if (ret!=0) {
2164 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
2165 goto fail;
2168 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
2170 /* send a message to all clients telling them that the cluster
2171 has been reconfigured */
2172 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2173 CTDB_SRVID_RECONFIGURE, tdb_null);
2174 if (ret != 0) {
2175 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2176 goto fail;
2179 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2181 rec->need_recovery = false;
2182 ctdb_op_end(rec->recovery);
2184 /* we managed to complete a full recovery, make sure to forgive
2185 any past sins by the nodes that could now participate in the
2186 recovery.
2188 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2189 for (i=0;i<nodemap->num;i++) {
2190 struct ctdb_banning_state *ban_state;
2192 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2193 continue;
2196 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2197 if (ban_state == NULL) {
2198 continue;
2201 ban_state->count = 0;
2204 /* We just finished a recovery successfully.
2205 We now wait for rerecovery_timeout before we allow
2206 another recovery to take place.
2208 DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2209 ctdb_op_disable(rec->recovery, ctdb->ev,
2210 ctdb->tunable.rerecovery_timeout);
2211 return 0;
2213 fail:
2214 ctdb_op_end(rec->recovery);
2215 return -1;
2220 elections are won by first checking the number of connected nodes, then
2221 the priority time, then the pnn
2223 struct election_message {
2224 uint32_t num_connected;
2225 struct timeval priority_time;
2226 uint32_t pnn;
2227 uint32_t node_flags;
2231 form this node's election data
2233 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2235 int ret, i;
2236 struct ctdb_node_map *nodemap;
2237 struct ctdb_context *ctdb = rec->ctdb;
2239 ZERO_STRUCTP(em);
2241 em->pnn = rec->ctdb->pnn;
2242 em->priority_time = rec->priority_time;
2244 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2245 if (ret != 0) {
2246 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2247 return;
2250 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2251 em->node_flags = rec->node_flags;
2253 for (i=0;i<nodemap->num;i++) {
2254 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2255 em->num_connected++;
2259 /* we shouldn't try to win this election if we can't be a recmaster */
2260 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2261 em->num_connected = 0;
2262 em->priority_time = timeval_current();
2265 talloc_free(nodemap);
2269 see if the given election data wins
2271 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2273 struct election_message myem;
2274 int cmp = 0;
2276 ctdb_election_data(rec, &myem);
2278 /* we can't win if we don't have the recmaster capability */
2279 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2280 return false;
2283 /* we can't win if we are banned */
2284 if (rec->node_flags & NODE_FLAGS_BANNED) {
2285 return false;
2288 /* we can't win if we are stopped */
2289 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2290 return false;
2293 /* we will automatically win if the other node is banned */
2294 if (em->node_flags & NODE_FLAGS_BANNED) {
2295 return true;
2298 /* we will automatically win if the other node is stopped */
2299 if (em->node_flags & NODE_FLAGS_STOPPED) {
2300 return true;
2303 /* try to use the most connected node */
2304 if (cmp == 0) {
2305 cmp = (int)myem.num_connected - (int)em->num_connected;
2308 /* then the longest running node */
2309 if (cmp == 0) {
2310 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2313 if (cmp == 0) {
2314 cmp = (int)myem.pnn - (int)em->pnn;
2317 return cmp > 0;
2321 send out an election request
2323 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2325 int ret;
2326 TDB_DATA election_data;
2327 struct election_message emsg;
2328 uint64_t srvid;
2329 struct ctdb_context *ctdb = rec->ctdb;
2331 srvid = CTDB_SRVID_RECOVERY;
2333 ctdb_election_data(rec, &emsg);
2335 election_data.dsize = sizeof(struct election_message);
2336 election_data.dptr = (unsigned char *)&emsg;
2339 /* first we assume we will win the election and set the
2340 recovery master to be ourselves on the current node
2342 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn);
2343 if (ret != 0) {
2344 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster on local node\n"));
2345 return -1;
2349 /* send an election message to all active nodes */
2350 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2351 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2355 this function will unban all nodes in the cluster
2357 static void unban_all_nodes(struct ctdb_context *ctdb)
2359 int ret, i;
2360 struct ctdb_node_map *nodemap;
2361 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2363 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2364 if (ret != 0) {
2365 DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
2366 return;
2369 for (i=0;i<nodemap->num;i++) {
2370 if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
2371 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
2372 ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(),
2373 nodemap->nodes[i].pnn, 0,
2374 NODE_FLAGS_BANNED);
2375 if (ret != 0) {
2376 DEBUG(DEBUG_ERR, (__location__ " failed to reset ban state\n"));
2381 talloc_free(tmp_ctx);
2386 we think we are winning the election - send a broadcast election request
2388 static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
2390 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2391 int ret;
2393 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2394 if (ret != 0) {
2395 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2398 talloc_free(rec->send_election_te);
2399 rec->send_election_te = NULL;
2403 handler for memory dumps
2405 static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
2406 TDB_DATA data, void *private_data)
2408 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2409 TDB_DATA *dump;
2410 int ret;
2411 struct srvid_request *rd;
2413 if (data.dsize != sizeof(struct srvid_request)) {
2414 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2415 talloc_free(tmp_ctx);
2416 return;
2418 rd = (struct srvid_request *)data.dptr;
2420 dump = talloc_zero(tmp_ctx, TDB_DATA);
2421 if (dump == NULL) {
2422 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2423 talloc_free(tmp_ctx);
2424 return;
2426 ret = ctdb_dump_memory(ctdb, dump);
2427 if (ret != 0) {
2428 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2429 talloc_free(tmp_ctx);
2430 return;
2433 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2435 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2436 if (ret != 0) {
2437 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2438 talloc_free(tmp_ctx);
2439 return;
2442 talloc_free(tmp_ctx);
2446 handler for reload_nodes
2448 static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid,
2449 TDB_DATA data, void *private_data)
2451 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2453 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2455 ctdb_load_nodes_file(rec->ctdb);
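/*
  timer callback for a deferred rebalance: if any nodes are still queued
  for forced rebalancing when the timeout fires, run a takeover run so
  that IPs can be moved to them
 */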
2459 static void ctdb_rebalance_timeout(struct event_context *ev,
2460 struct timed_event *te,
2461 struct timeval t, void *p)
2463 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2465 if (rec->force_rebalance_nodes == NULL) {
2466 DEBUG(DEBUG_ERR,
2467 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2468 return;
2471 DEBUG(DEBUG_NOTICE,
2472 ("Rebalance timeout occurred - do takeover run\n"));
2473 do_takeover_run(rec, rec->nodemap, false);
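/*
  handler for node rebalance requests: the message carries the PNN of a
  node that IPs should be rebalanced to.  Only the recovery master acts
  on these messages.  The PNN is appended to rec->force_rebalance_nodes
  and, if deferred_rebalance_on_node_add is set, a timer is armed to
  force a takeover run after that timeout.
 */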
2477 static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
2478 uint64_t srvid,
2479 TDB_DATA data, void *private_data)
2481 uint32_t pnn;
2482 uint32_t *t;
2483 int len;
2484 uint32_t deferred_rebalance;
2485 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2487 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2488 return;
2491 if (data.dsize != sizeof(uint32_t)) {
2492 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2493 return;
2496 pnn = *(uint32_t *)&data.dptr[0];
2498 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2500 /* Copy any existing list of nodes. There's probably some
2501 * sort of realloc variant that will do this but we need to
2502 * make sure that freeing the old array also cancels the timer
2503 * event for the timeout... not sure if realloc will do that.
2505 len = (rec->force_rebalance_nodes != NULL) ?
2506 talloc_array_length(rec->force_rebalance_nodes) :
2509 /* This allows duplicates to be added but they don't cause
2510 * harm. A call to add a duplicate PNN arguably means that
2511 * the timeout should be reset, so this is the simplest
2512 * solution.
2514 t = talloc_zero_array(rec, uint32_t, len+1);
2515 CTDB_NO_MEMORY_VOID(ctdb, t);
2516 if (len > 0) {
2517 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2519 t[len] = pnn;
2521 talloc_free(rec->force_rebalance_nodes);
2523 rec->force_rebalance_nodes = t;
2525 /* If configured, setup a deferred takeover run to make sure
2526 * that certain nodes get IPs rebalanced to them. This will
2527 * be cancelled if a successful takeover run happens before
2528 * the timeout. Assign tunable value to variable for
2529 * readability.
2531 deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2532 if (deferred_rebalance != 0) {
2533 event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
2534 timeval_current_ofs(deferred_rebalance, 0),
2535 ctdb_rebalance_timeout, rec);
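/*
  handler for public IP assignment updates: on the recovery master,
  record which node now hosts the given public IP in the IP assignment
  tree
 */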
2541 static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid,
2542 TDB_DATA data, void *private_data)
2544 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2545 struct ctdb_public_ip *ip;
2547 if (rec->recmaster != rec->ctdb->pnn) {
2548 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2549 return;
2552 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2553 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2554 return;
2557 ip = (struct ctdb_public_ip *)data.dptr;
2559 update_ip_assignment_tree(rec->ctdb, ip);
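/*
  common helper for the "disable takeover runs" and "disable recoveries"
  messages: validate the request, disable the given operation for the
  requested number of seconds and reply with our PNN on success or an
  error code on failure
 */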
2562 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
2563 TDB_DATA data,
2564 struct ctdb_op_state *op_state)
2566 struct srvid_request_data *r;
2567 uint32_t timeout;
2568 TDB_DATA result;
2569 int32_t ret = 0;
2571 /* Validate input data */
2572 if (data.dsize != sizeof(struct srvid_request_data)) {
2573 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2574 "expecting %lu\n", (long unsigned)data.dsize,
2575 (long unsigned)sizeof(struct srvid_request)));
2576 return;
2578 if (data.dptr == NULL) {
2579 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2580 return;
2583 r = (struct srvid_request_data *)data.dptr;
2584 timeout = r->data;
2586 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
2587 if (ret != 0) {
2588 goto done;
2591 /* Returning our PNN tells the caller that we succeeded */
2592 ret = ctdb_get_pnn(ctdb);
2593 done:
2594 result.dsize = sizeof(int32_t);
2595 result.dptr = (uint8_t *)&ret;
2596 srvid_request_reply(ctdb, (struct srvid_request *)r, result);
2599 static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
2600 uint64_t srvid, TDB_DATA data,
2601 void *private_data)
2603 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2604 struct ctdb_recoverd);
2606 srvid_disable_and_reply(ctdb, data, rec->takeover_run);
2609 /* Backward compatibility for this SRVID */
2610 static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
2611 TDB_DATA data, void *private_data)
2613 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2614 struct ctdb_recoverd);
2615 uint32_t timeout;
2617 if (data.dsize != sizeof(uint32_t)) {
2618 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2619 "expecting %lu\n", (long unsigned)data.dsize,
2620 (long unsigned)sizeof(uint32_t)));
2621 return;
2623 if (data.dptr == NULL) {
2624 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2625 return;
2628 timeout = *((uint32_t *)data.dptr);
2630 ctdb_op_disable(rec->takeover_run, ctdb->ev, timeout);
2633 static void disable_recoveries_handler(struct ctdb_context *ctdb,
2634 uint64_t srvid, TDB_DATA data,
2635 void *private_data)
2637 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2638 struct ctdb_recoverd);
2640 srvid_disable_and_reply(ctdb, data, rec->recovery);
2644 handler for ip reallocation requests: just add the request to the list and
2645 handle it later in the monitor_cluster loop so we do not recurse
2646 with other requests to takeover_run()
2648 static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
2649 TDB_DATA data, void *private_data)
2651 struct srvid_request *request;
2652 struct ctdb_recoverd *rec = talloc_get_type(private_data,
2653 struct ctdb_recoverd);
2655 if (data.dsize != sizeof(struct srvid_request)) {
2656 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2657 return;
2660 request = (struct srvid_request *)data.dptr;
2662 srvid_request_add(ctdb, &rec->reallocate_requests, request);
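/*
  process all queued ip reallocation requests: refresh the public IP
  information from all connected nodes, perform a takeover run and then
  reply to every queued requester with our PNN on success or an error
  code on failure
 */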
2665 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2666 struct ctdb_recoverd *rec)
2668 TDB_DATA result;
2669 int32_t ret;
2670 uint32_t culprit;
2671 struct srvid_requests *current;
2673 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2675 /* Only process requests that are currently pending. More
2676 * might come in while the takeover run is in progress and
2677 * they will need to be processed later since they might
2678 * be in response to flag changes.
2680 current = rec->reallocate_requests;
2681 rec->reallocate_requests = NULL;
2683 /* update the list of public ips that a node can handle for
2684 all connected nodes
2686 ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
2687 if (ret != 0) {
2688 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
2689 culprit));
2690 rec->need_takeover_run = true;
2692 if (ret == 0) {
2693 if (do_takeover_run(rec, rec->nodemap, false)) {
2694 ret = ctdb_get_pnn(ctdb);
2695 } else {
2696 ret = -1;
2700 result.dsize = sizeof(int32_t);
2701 result.dptr = (uint8_t *)&ret;
2703 srvid_requests_reply(ctdb, &current, result);
2708 handler for recovery master elections
2710 static void election_handler(struct ctdb_context *ctdb, uint64_t srvid,
2711 TDB_DATA data, void *private_data)
2713 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2714 int ret;
2715 struct election_message *em = (struct election_message *)data.dptr;
2717 /* Ignore election packets from ourself */
2718 if (ctdb->pnn == em->pnn) {
2719 return;
2722 /* we got an election packet - update the timeout for the election */
2723 talloc_free(rec->election_timeout);
2724 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2725 fast_start ?
2726 timeval_current_ofs(0, 500000) :
2727 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2728 ctdb_election_timeout, rec);
2730 /* someone called an election. check their election data
2731 and if we disagree and we would rather be the elected node,
2732 send a new election message to all other nodes
2734 if (ctdb_election_win(rec, em)) {
2735 if (!rec->send_election_te) {
2736 rec->send_election_te = event_add_timed(ctdb->ev, rec,
2737 timeval_current_ofs(0, 500000),
2738 election_send_request, rec);
2740 /*unban_all_nodes(ctdb);*/
2741 return;
2744 /* we didn't win */
2745 TALLOC_FREE(rec->send_election_te);
2747 /* Release the recovery lock file */
2748 if (ctdb_recovery_have_lock(ctdb)) {
2749 ctdb_recovery_unlock(ctdb);
2750 unban_all_nodes(ctdb);
2753 /* ok, let that guy become recmaster then */
2754 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
2755 if (ret != 0) {
2756 DEBUG(DEBUG_ERR, (__location__ " failed to set new recmaster\n"));
2757 return;
2760 return;
2765 force the start of the election process
2767 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2768 struct ctdb_node_map *nodemap)
2770 int ret;
2771 struct ctdb_context *ctdb = rec->ctdb;
2773 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2775 /* set all nodes to recovery mode to stop all internode traffic */
2776 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2777 if (ret != 0) {
2778 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2779 return;
2782 talloc_free(rec->election_timeout);
2783 rec->election_timeout = event_add_timed(ctdb->ev, ctdb,
2784 fast_start ?
2785 timeval_current_ofs(0, 500000) :
2786 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2787 ctdb_election_timeout, rec);
2789 ret = send_election_request(rec, pnn);
2790 if (ret!=0) {
2791 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2792 return;
2795 /* wait for a few seconds to collect all responses */
2796 ctdb_wait_election(rec);
2802 handler for when a node changes its flags
2804 static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
2805 TDB_DATA data, void *private_data)
2807 int ret;
2808 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2809 struct ctdb_node_map *nodemap=NULL;
2810 TALLOC_CTX *tmp_ctx;
2811 int i;
2812 struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
2813 int disabled_flag_changed;
2815 if (data.dsize != sizeof(*c)) {
2816 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2817 return;
2820 tmp_ctx = talloc_new(ctdb);
2821 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2823 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2824 if (ret != 0) {
2825 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2826 talloc_free(tmp_ctx);
2827 return;
2831 for (i=0;i<nodemap->num;i++) {
2832 if (nodemap->nodes[i].pnn == c->pnn) break;
2835 if (i == nodemap->num) {
2836 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existent node %u\n", c->pnn));
2837 talloc_free(tmp_ctx);
2838 return;
2841 if (c->old_flags != c->new_flags) {
2842 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2845 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2847 nodemap->nodes[i].flags = c->new_flags;
2849 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2850 CTDB_CURRENT_NODE, &ctdb->recovery_master);
2852 if (ret == 0) {
2853 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2854 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2857 if (ret == 0 &&
2858 ctdb->recovery_master == ctdb->pnn &&
2859 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2860 /* Only do the takeover run if the perm disabled or unhealthy
2861 flags changed, since these will cause an ip failover but not
2862 a recovery.
2863 If the node became disconnected or banned, this will also
2864 lead to an ip address failover, but that is handled
2865 during recovery.
2867 if (disabled_flag_changed) {
2868 rec->need_takeover_run = true;
2872 talloc_free(tmp_ctx);
2876 handler for when we need to push out flag changes to all other nodes
2878 static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid,
2879 TDB_DATA data, void *private_data)
2881 int ret;
2882 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2883 struct ctdb_node_map *nodemap=NULL;
2884 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2885 uint32_t recmaster;
2886 uint32_t *nodes;
2888 /* find the recovery master */
2889 ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
2890 if (ret != 0) {
2891 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
2892 talloc_free(tmp_ctx);
2893 return;
2896 /* read the node flags from the recmaster */
2897 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
2898 if (ret != 0) {
2899 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2900 talloc_free(tmp_ctx);
2901 return;
2903 if (c->pnn >= nodemap->num) {
2904 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2905 talloc_free(tmp_ctx);
2906 return;
2909 /* send the flags update to all connected nodes */
2910 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2912 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2913 nodes, 0, CONTROL_TIMEOUT(),
2914 false, data,
2915 NULL, NULL,
2916 NULL) != 0) {
2917 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2919 talloc_free(tmp_ctx);
2920 return;
2923 talloc_free(tmp_ctx);
2927 struct verify_recmode_normal_data {
2928 uint32_t count;
2929 enum monitor_result status;
2932 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2934 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2937 /* one more node has responded with recmode data*/
2938 rmdata->count--;
2940 /* if we failed to get the recmode, then return an error and let
2941 the main loop try again.
2943 if (state->state != CTDB_CONTROL_DONE) {
2944 if (rmdata->status == MONITOR_OK) {
2945 rmdata->status = MONITOR_FAILED;
2947 return;
2950 /* if we got a response, then the recmode will be stored in the
2951 status field
2953 if (state->status != CTDB_RECOVERY_NORMAL) {
2954 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2955 rmdata->status = MONITOR_RECOVERY_NEEDED;
2958 return;
2962 /* verify that all nodes are in normal recovery mode */
2963 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
2965 struct verify_recmode_normal_data *rmdata;
2966 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2967 struct ctdb_client_control_state *state;
2968 enum monitor_result status;
2969 int j;
2971 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2972 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2973 rmdata->count = 0;
2974 rmdata->status = MONITOR_OK;
2976 /* loop over all active nodes and send an async getrecmode call to
2977 them*/
2978 for (j=0; j<nodemap->num; j++) {
2979 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2980 continue;
2982 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2983 CONTROL_TIMEOUT(),
2984 nodemap->nodes[j].pnn);
2985 if (state == NULL) {
2986 /* we failed to send the control, treat this as
2987 an error and try again next iteration
2989 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2990 talloc_free(mem_ctx);
2991 return MONITOR_FAILED;
2994 /* set up the callback functions */
2995 state->async.fn = verify_recmode_normal_callback;
2996 state->async.private_data = rmdata;
2998 /* one more control to wait for to complete */
2999 rmdata->count++;
3003 /* now wait for up to the maximum number of seconds allowed
3004 or until all nodes we expect a response from have replied
3006 while (rmdata->count > 0) {
3007 event_loop_once(ctdb->ev);
3010 status = rmdata->status;
3011 talloc_free(mem_ctx);
3012 return status;
3016 struct verify_recmaster_data {
3017 struct ctdb_recoverd *rec;
3018 uint32_t count;
3019 uint32_t pnn;
3020 enum monitor_result status;
3023 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
3025 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
3028 /* one more node has responded with recmaster data*/
3029 rmdata->count--;
3031 /* if we failed to get the recmaster, then return an error and let
3032 the main loop try again.
3034 if (state->state != CTDB_CONTROL_DONE) {
3035 if (rmdata->status == MONITOR_OK) {
3036 rmdata->status = MONITOR_FAILED;
3038 return;
3041 /* if we got a response, then the recmaster will be stored in the
3042 status field
3044 if (state->status != rmdata->pnn) {
3045 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
3046 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
3047 rmdata->status = MONITOR_ELECTION_NEEDED;
3050 return;
3054 /* verify that all nodes agree that we are the recmaster */
3055 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
3057 struct ctdb_context *ctdb = rec->ctdb;
3058 struct verify_recmaster_data *rmdata;
3059 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3060 struct ctdb_client_control_state *state;
3061 enum monitor_result status;
3062 int j;
3064 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
3065 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3066 rmdata->rec = rec;
3067 rmdata->count = 0;
3068 rmdata->pnn = pnn;
3069 rmdata->status = MONITOR_OK;
3071 /* loop over all active nodes and send an async getrecmaster call to
3072 them*/
3073 for (j=0; j<nodemap->num; j++) {
3074 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3075 continue;
3077 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3078 CONTROL_TIMEOUT(),
3079 nodemap->nodes[j].pnn);
3080 if (state == NULL) {
3081 /* we failed to send the control, treat this as
3082 an error and try again next iteration
3084 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3085 talloc_free(mem_ctx);
3086 return MONITOR_FAILED;
3089 /* set up the callback functions */
3090 state->async.fn = verify_recmaster_callback;
3091 state->async.private_data = rmdata;
3093 /* one more control to wait for to complete */
3094 rmdata->count++;
3098 /* now wait for up to the maximum number of seconds allowed
3099 or until all nodes we expect a response from have replied
3101 while (rmdata->count > 0) {
3102 event_loop_once(ctdb->ev);
3105 status = rmdata->status;
3106 talloc_free(mem_ctx);
3107 return status;
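/*
  compare the public network interfaces reported by the local daemon
  against the copy cached from the previous iteration; returns true if
  the number of interfaces, an interface name or a link state has
  changed, in which case the caller forces a takeover run
 */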
3110 static bool interfaces_have_changed(struct ctdb_context *ctdb,
3111 struct ctdb_recoverd *rec)
3113 struct ctdb_control_get_ifaces *ifaces = NULL;
3114 TALLOC_CTX *mem_ctx;
3115 bool ret = false;
3117 mem_ctx = talloc_new(NULL);
3119 /* Read the interfaces from the local node */
3120 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3121 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3122 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3123 /* We could return an error. However, this will be
3124 * rare so we'll decide that the interfaces have
3125 * actually changed, just in case.
3127 talloc_free(mem_ctx);
3128 return true;
3131 if (!rec->ifaces) {
3132 /* We haven't been here before so things have changed */
3133 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3134 ret = true;
3135 } else if (rec->ifaces->num != ifaces->num) {
3136 /* Number of interfaces has changed */
3137 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3138 rec->ifaces->num, ifaces->num));
3139 ret = true;
3140 } else {
3141 /* See if interface names or link states have changed */
3142 int i;
3143 for (i = 0; i < rec->ifaces->num; i++) {
3144 struct ctdb_control_iface_info * iface = &rec->ifaces->ifaces[i];
3145 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3146 DEBUG(DEBUG_NOTICE,
3147 ("Interface in slot %d changed: %s => %s\n",
3148 i, iface->name, ifaces->ifaces[i].name));
3149 ret = true;
3150 break;
3152 if (iface->link_state != ifaces->ifaces[i].link_state) {
3153 DEBUG(DEBUG_NOTICE,
3154 ("Interface %s changed state: %d => %d\n",
3155 iface->name, iface->link_state,
3156 ifaces->ifaces[i].link_state));
3157 ret = true;
3158 break;
3163 talloc_free(rec->ifaces);
3164 rec->ifaces = talloc_steal(rec, ifaces);
3166 talloc_free(mem_ctx);
3167 return ret;
3170 /* called to check that the local allocation of public ip addresses is ok.
3172 static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
3174 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3175 struct ctdb_uptime *uptime1 = NULL;
3176 struct ctdb_uptime *uptime2 = NULL;
3177 int ret, j;
3178 bool need_takeover_run = false;
3180 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3181 CTDB_CURRENT_NODE, &uptime1);
3182 if (ret != 0) {
3183 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3184 talloc_free(mem_ctx);
3185 return -1;
3188 if (interfaces_have_changed(ctdb, rec)) {
3189 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3190 "local node %u - force takeover run\n",
3191 pnn));
3192 need_takeover_run = true;
3195 ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
3196 CTDB_CURRENT_NODE, &uptime2);
3197 if (ret != 0) {
3198 DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
3199 talloc_free(mem_ctx);
3200 return -1;
3203 /* skip the check if the startrecovery time has changed */
3204 if (timeval_compare(&uptime1->last_recovery_started,
3205 &uptime2->last_recovery_started) != 0) {
3206 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3207 talloc_free(mem_ctx);
3208 return 0;
3211 /* skip the check if the endrecovery time has changed */
3212 if (timeval_compare(&uptime1->last_recovery_finished,
3213 &uptime2->last_recovery_finished) != 0) {
3214 DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
3215 talloc_free(mem_ctx);
3216 return 0;
3219 /* skip the check if we have started but not finished recovery */
3220 if (timeval_compare(&uptime1->last_recovery_finished,
3221 &uptime1->last_recovery_started) != 1) {
3222 DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
3223 talloc_free(mem_ctx);
3225 return 0;
3228 /* verify that we have the ip addresses we should have
3229 and we don't have ones we shouldn't have.
3230 if we find an inconsistency we set recmode to
3231 active on the local node and wait for the recmaster
3232 to do a full blown recovery.
3233 also if the pnn is -1 and we are healthy and can host the ip
3234 we also request an ip reallocation.
3236 if (ctdb->tunable.disable_ip_failover == 0) {
3237 struct ctdb_all_public_ips *ips = NULL;
3239 /* read the *available* IPs from the local node */
3240 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3241 if (ret != 0) {
3242 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3243 talloc_free(mem_ctx);
3244 return -1;
3247 for (j=0; j<ips->num; j++) {
3248 if (ips->ips[j].pnn == -1 &&
3249 nodemap->nodes[pnn].flags == 0) {
3250 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3251 ctdb_addr_to_str(&ips->ips[j].addr)));
3252 need_takeover_run = true;
3256 talloc_free(ips);
3258 /* read the *known* IPs from the local node */
3259 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3260 if (ret != 0) {
3261 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3262 talloc_free(mem_ctx);
3263 return -1;
3266 for (j=0; j<ips->num; j++) {
3267 if (ips->ips[j].pnn == pnn) {
3268 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3269 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3270 ctdb_addr_to_str(&ips->ips[j].addr)));
3271 need_takeover_run = true;
3273 } else {
3274 if (ctdb->do_checkpublicip &&
3275 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3277 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3278 ctdb_addr_to_str(&ips->ips[j].addr)));
3280 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3281 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3288 if (need_takeover_run) {
3289 struct srvid_request rd;
3290 TDB_DATA data;
3292 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3294 rd.pnn = ctdb->pnn;
3295 rd.srvid = 0;
3296 data.dptr = (uint8_t *)&rd;
3297 data.dsize = sizeof(rd);
3299 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3300 if (ret != 0) {
3301 DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
3304 talloc_free(mem_ctx);
3305 return 0;
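/*
  callback for the async GET_NODEMAP control: stash the nodemap returned
  by each remote node in the remote_nodemaps array, indexed by PNN
 */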
3309 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3311 struct ctdb_node_map **remote_nodemaps = callback_data;
3313 if (node_pnn >= ctdb->num_nodes) {
3314 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3315 return;
3318 remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
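/*
  fetch the nodemap from every active node in parallel so that the
  results can be cross-checked against our own view of the cluster
 */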
3322 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3323 struct ctdb_node_map *nodemap,
3324 struct ctdb_node_map **remote_nodemaps)
3326 uint32_t *nodes;
3328 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3329 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3330 nodes, 0,
3331 CONTROL_TIMEOUT(), false, tdb_null,
3332 async_getnodemap_callback,
3333 NULL,
3334 remote_nodemaps) != 0) {
3335 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3337 return -1;
3340 return 0;
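/*
  read the current reclock file setting from the local daemon and update
  our cached copy; whenever the setting is enabled, disabled or changed,
  any recovery lock held on the old file is released
 */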
3343 static int update_recovery_lock_file(struct ctdb_context *ctdb)
3345 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3346 const char *reclockfile;
3348 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3349 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3350 talloc_free(tmp_ctx);
3351 return -1;
3354 if (reclockfile == NULL) {
3355 if (ctdb->recovery_lock_file != NULL) {
3356 DEBUG(DEBUG_NOTICE,("Recovery lock file disabled\n"));
3357 talloc_free(ctdb->recovery_lock_file);
3358 ctdb->recovery_lock_file = NULL;
3359 ctdb_recovery_unlock(ctdb);
3361 talloc_free(tmp_ctx);
3362 return 0;
3365 if (ctdb->recovery_lock_file == NULL) {
3366 DEBUG(DEBUG_NOTICE,
3367 ("Recovery lock file enabled (%s)\n", reclockfile));
3368 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3369 ctdb_recovery_unlock(ctdb);
3370 talloc_free(tmp_ctx);
3371 return 0;
3375 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3376 talloc_free(tmp_ctx);
3377 return 0;
3380 DEBUG(DEBUG_NOTICE,
3381 ("Recovery lock file changed (now %s)\n", reclockfile));
3382 talloc_free(ctdb->recovery_lock_file);
3383 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3384 ctdb_recovery_unlock(ctdb);
3386 talloc_free(tmp_ctx);
3387 return 0;
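/*
  one iteration of the recovery daemon's monitoring logic: verify that
  the main daemon is alive, refresh tunables and cluster state, force an
  election if the recovery master is missing or unsuitable, and, on the
  recovery master, cross-check nodemaps, flags and vnnmaps across the
  cluster, triggering a recovery or a takeover run when an inconsistency
  is found
 */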
3390 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3391 TALLOC_CTX *mem_ctx)
3393 uint32_t pnn;
3394 struct ctdb_node_map *nodemap=NULL;
3395 struct ctdb_node_map *recmaster_nodemap=NULL;
3396 struct ctdb_node_map **remote_nodemaps=NULL;
3397 struct ctdb_vnn_map *vnnmap=NULL;
3398 struct ctdb_vnn_map *remote_vnnmap=NULL;
3399 uint32_t num_lmasters;
3400 int32_t debug_level;
3401 int i, j, ret;
3402 bool self_ban;
3405 /* verify that the main daemon is still running */
3406 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3407 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3408 exit(-1);
3411 /* ping the local daemon to tell it we are alive */
3412 ctdb_ctrl_recd_ping(ctdb);
3414 if (rec->election_timeout) {
3415 /* an election is in progress */
3416 return;
3419 /* read the debug level from the parent and update locally */
3420 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3421 if (ret !=0) {
3422 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3423 return;
3425 DEBUGLEVEL = debug_level;
3427 /* get relevant tunables */
3428 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3429 if (ret != 0) {
3430 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3431 return;
3434 /* get runstate */
3435 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3436 CTDB_CURRENT_NODE, &ctdb->runstate);
3437 if (ret != 0) {
3438 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3439 return;
3442 /* get the current recovery lock file from the server */
3443 if (update_recovery_lock_file(ctdb) != 0) {
3444 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3445 return;
3448 /* Make sure that if recovery lock verification becomes disabled,
3449 we close the file
3451 if (ctdb->recovery_lock_file == NULL) {
3452 ctdb_recovery_unlock(ctdb);
3455 pnn = ctdb_get_pnn(ctdb);
3457 /* get the vnnmap */
3458 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3459 if (ret != 0) {
3460 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3461 return;
3465 /* get number of nodes */
3466 if (rec->nodemap) {
3467 talloc_free(rec->nodemap);
3468 rec->nodemap = NULL;
3469 nodemap=NULL;
3471 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3472 if (ret != 0) {
3473 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3474 return;
3476 nodemap = rec->nodemap;
3478 /* remember our own node flags */
3479 rec->node_flags = nodemap->nodes[pnn].flags;
3481 ban_misbehaving_nodes(rec, &self_ban);
3482 if (self_ban) {
3483 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3484 return;
3487 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3488 also frozen and that the recmode is set to active.
3490 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3491 /* If this node has become inactive then we want to
3492 * reduce the chances of it taking over the recovery
3493 * master role when it becomes active again. This
3494 * helps to stabilise the recovery master role so that
3495 * it stays on the most stable node.
3497 rec->priority_time = timeval_current();
3499 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3500 if (ret != 0) {
3501 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3503 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3504 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3506 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3507 if (ret != 0) {
3508 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3510 return;
3512 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE);
3513 if (ret != 0) {
3514 DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
3515 return;
3519 /* If this node is stopped or banned then it is not the recovery
3520 * master, so don't do anything. This prevents a stopped or banned
3521 * node from starting an election and sending unnecessary controls.
3523 return;
3526 /* check which node is the recovery master */
3527 ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
3528 if (ret != 0) {
3529 DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
3530 return;
3533 /* If we are not the recmaster then do some housekeeping */
3534 if (rec->recmaster != pnn) {
3535 /* Ignore any IP reallocate requests - only recmaster
3536 * processes them
3538 TALLOC_FREE(rec->reallocate_requests);
3539 /* Clear any nodes that should be force rebalanced in
3540 * the next takeover run. If the recovery master role
3541 * has moved then we don't want to process these some
3542 * time in the future.
3544 TALLOC_FREE(rec->force_rebalance_nodes);
3547 /* This is a special case. When the recovery daemon is started, recmaster
3548 * is set to -1. If a node is not started in the stopped state, then
3549 * start an election to decide the recovery master
3551 if (rec->recmaster == (uint32_t)-1) {
3552 DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
3553 force_election(rec, pnn, nodemap);
3554 return;
3557 /* update the capabilities for all nodes */
3558 ret = update_capabilities(rec, nodemap);
3559 if (ret != 0) {
3560 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3561 return;
3565 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3566 * but we have, then force an election and try to become the new
3567 * recmaster.
3569 if (!ctdb_node_has_capabilities(rec->caps,
3570 rec->recmaster,
3571 CTDB_CAP_RECMASTER) &&
3572 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3573 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3574 DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
3575 " but we (node %u) have - force an election\n",
3576 rec->recmaster, pnn));
3577 force_election(rec, pnn, nodemap);
3578 return;
3581 /* verify that the recmaster node is still active */
3582 for (j=0; j<nodemap->num; j++) {
3583 if (nodemap->nodes[j].pnn==rec->recmaster) {
3584 break;
3588 if (j == nodemap->num) {
3589 DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
3590 force_election(rec, pnn, nodemap);
3591 return;
3594 /* if recovery master is disconnected we must elect a new recmaster */
3595 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
3596 DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
3597 force_election(rec, pnn, nodemap);
3598 return;
3601 /* get nodemap from the recovery master to check if it is inactive */
3602 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3603 mem_ctx, &recmaster_nodemap);
3604 if (ret != 0) {
3605 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n",
3606 nodemap->nodes[j].pnn));
3607 return;
3611 if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
3612 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3613 DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
3615 * update our nodemap to carry the recmaster's notion of
3616 * its own flags, so that we don't keep freezing the
3617 * inactive recmaster node...
3619 nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
3620 force_election(rec, pnn, nodemap);
3621 return;
3624 /* verify that we have all ip addresses we should have and we don't
3625 * have addresses we shouldn't have.
3627 if (ctdb->tunable.disable_ip_failover == 0 &&
3628 !ctdb_op_is_disabled(rec->takeover_run)) {
3629 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3630 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3635 /* if we are not the recmaster then we do not need to check
3636 if recovery is needed
3638 if (pnn != rec->recmaster) {
3639 return;
3643 /* ensure our local copies of flags are right */
3644 ret = update_local_flags(rec, nodemap);
3645 if (ret == MONITOR_ELECTION_NEEDED) {
3646 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3647 force_election(rec, pnn, nodemap);
3648 return;
3650 if (ret != MONITOR_OK) {
3651 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3652 return;
3655 if (ctdb->num_nodes != nodemap->num) {
3656 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3657 ctdb_load_nodes_file(ctdb);
3658 return;
3661 /* verify that all active nodes agree that we are the recmaster */
3662 switch (verify_recmaster(rec, nodemap, pnn)) {
3663 case MONITOR_RECOVERY_NEEDED:
3664 /* can not happen */
3665 return;
3666 case MONITOR_ELECTION_NEEDED:
3667 force_election(rec, pnn, nodemap);
3668 return;
3669 case MONITOR_OK:
3670 break;
3671 case MONITOR_FAILED:
3672 return;
3676 if (rec->need_recovery) {
3677 /* a previous recovery didn't finish */
3678 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3679 return;
3682 /* verify that all active nodes are in normal mode
3683 and not in recovery mode
3685 switch (verify_recmode(ctdb, nodemap)) {
3686 case MONITOR_RECOVERY_NEEDED:
3687 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3688 return;
3689 case MONITOR_FAILED:
3690 return;
3691 case MONITOR_ELECTION_NEEDED:
3692 /* can not happen */
3693 case MONITOR_OK:
3694 break;
3698 if (ctdb->recovery_lock_file != NULL) {
3699 /* We must already hold the recovery lock */
3700 if (!ctdb_recovery_have_lock(ctdb)) {
3701 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
3702 ctdb_set_culprit(rec, ctdb->pnn);
3703 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3704 return;
3709 /* if there are takeovers requested, perform it and notify the waiters */
3710 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3711 rec->reallocate_requests) {
3712 process_ipreallocate_requests(ctdb, rec);
3715 /* If recoveries are disabled then there is no use doing any
3716 * nodemap or flags checks. Recoveries might be disabled due
3717 * to "reloadnodes", so doing these checks might cause an
3718 * unnecessary recovery. */
3719 if (ctdb_op_is_disabled(rec->recovery)) {
3720 return;
3723 /* get the nodemap for all active remote nodes
3725 remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
3726 if (remote_nodemaps == NULL) {
3727 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
3728 return;
3730 for(i=0; i<nodemap->num; i++) {
3731 remote_nodemaps[i] = NULL;
3733 if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
3734 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
3735 return;
3738 /* verify that all other nodes have the same nodemap as we have
3740 for (j=0; j<nodemap->num; j++) {
3741 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3742 continue;
3745 if (remote_nodemaps[j] == NULL) {
3746 DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
3747 ctdb_set_culprit(rec, j);
3749 return;
3752 /* if the nodes disagree on how many nodes there are
3753 then this is a good reason to try recovery
3755 if (remote_nodemaps[j]->num != nodemap->num) {
3756 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
3757 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
3758 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3759 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3760 return;
3763 /* if the nodes disagree on which nodes exist and are
3764 active, then that is also a good reason to do recovery
3766 for (i=0;i<nodemap->num;i++) {
3767 if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
3768 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
3769 nodemap->nodes[j].pnn, i,
3770 remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
3771 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3772 do_recovery(rec, mem_ctx, pnn, nodemap,
3773 vnnmap);
3774 return;
3780 * Update node flags obtained from each active node. This ensures we have
3781 * up-to-date information for all the nodes.
3783 for (j=0; j<nodemap->num; j++) {
3784 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3785 continue;
3787 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
3790 for (j=0; j<nodemap->num; j++) {
3791 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3792 continue;
3795 /* verify the flags are consistent
3797 for (i=0; i<nodemap->num; i++) {
3798 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
3799 continue;
3802 if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
3803 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
3804 nodemap->nodes[j].pnn,
3805 nodemap->nodes[i].pnn,
3806 remote_nodemaps[j]->nodes[i].flags,
3807 nodemap->nodes[i].flags));
3808 if (i == j) {
3809 DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
3810 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
3811 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3812 do_recovery(rec, mem_ctx, pnn, nodemap,
3813 vnnmap);
3814 return;
3815 } else {
3816 DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
3817 update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
3818 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3819 do_recovery(rec, mem_ctx, pnn, nodemap,
3820 vnnmap);
3821 return;
3828 /* count how many active nodes with the lmaster capability there are */
3829 num_lmasters = 0;
3830 for (i=0; i<nodemap->num; i++) {
3831 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
3832 if (ctdb_node_has_capabilities(rec->caps,
3833 ctdb->nodes[i]->pnn,
3834 CTDB_CAP_LMASTER)) {
3835 num_lmasters++;
3841 /* There must be the same number of lmasters in the vnn map as
3842 * there are active nodes with the lmaster capability... or
3843 * do a recovery.
3845 if (vnnmap->size != num_lmasters) {
3846 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
3847 vnnmap->size, num_lmasters));
3848 ctdb_set_culprit(rec, ctdb->pnn);
3849 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3850 return;
3853 /* verify that all active nodes in the nodemap also exist in
3854 the vnnmap.
3856 for (j=0; j<nodemap->num; j++) {
3857 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3858 continue;
3860 if (nodemap->nodes[j].pnn == pnn) {
3861 continue;
3864 for (i=0; i<vnnmap->size; i++) {
3865 if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
3866 break;
3869 if (i == vnnmap->size) {
3870 DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
3871 nodemap->nodes[j].pnn));
3872 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3873 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3874 return;
3879 /* verify that all other nodes have the same vnnmap
3880 and are from the same generation
3882 for (j=0; j<nodemap->num; j++) {
3883 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3884 continue;
3886 if (nodemap->nodes[j].pnn == pnn) {
3887 continue;
3890 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3891 mem_ctx, &remote_vnnmap);
3892 if (ret != 0) {
3893 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3894 nodemap->nodes[j].pnn));
3895 return;
3898 /* verify the vnnmap generation is the same */
3899 if (vnnmap->generation != remote_vnnmap->generation) {
3900 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3901 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3902 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3903 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3904 return;
3907 /* verify the vnnmap size is the same */
3908 if (vnnmap->size != remote_vnnmap->size) {
3909 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3910 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3911 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3912 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3913 return;
3916 /* verify the vnnmap is the same */
3917 for (i=0;i<vnnmap->size;i++) {
3918 if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3919 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3920 nodemap->nodes[j].pnn));
3921 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3922 do_recovery(rec, mem_ctx, pnn, nodemap,
3923 vnnmap);
3924 return;
3929 /* we might need to change who has what IP assigned */
3930 if (rec->need_takeover_run) {
3931 uint32_t culprit = (uint32_t)-1;
3933 rec->need_takeover_run = false;
3935 /* update the list of public ips that a node can handle for
3936 all connected nodes
3938 ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
3939 if (ret != 0) {
3940 DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
3941 culprit));
3942 rec->need_takeover_run = true;
3943 return;
3946 /* execute the "startrecovery" event script on all nodes */
3947 ret = run_startrecovery_eventscript(rec, nodemap);
3948 if (ret!=0) {
3949 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
3950 ctdb_set_culprit(rec, ctdb->pnn);
3951 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3952 return;
3955 /* If the takeover run fails, the offending nodes are
3956 * assigned ban culprit counts and the takeover run is retried.
3957 * If it fails repeatedly, the offending node eventually gets
3958 * banned.
3960 * If rec->need_takeover_run is not set back to true on such a
3961 * failure, monitoring remains disabled cluster-wide (it is turned
3962 * off by the startrecovery eventscript) and will never be re-enabled.
3964 if (!do_takeover_run(rec, nodemap, true)) {
3965 return;
3968 /* execute the "recovered" event script on all nodes */
3969 ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
3970 #if 0
3971 // we can't check whether the event completed successfully
3972 // since this script WILL fail if the node is in recovery mode
3973 // and if that race happens, the code here would just cause a second
3974 // cascading recovery.
3975 if (ret!=0) {
3976 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
3977 ctdb_set_culprit(rec, ctdb->pnn);
3978 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3980 #endif
3985 the main monitoring loop
3987 static void monitor_cluster(struct ctdb_context *ctdb)
3989 struct ctdb_recoverd *rec;
3991 DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3993 rec = talloc_zero(ctdb, struct ctdb_recoverd);
3994 CTDB_NO_MEMORY_FATAL(ctdb, rec);
3996 rec->ctdb = ctdb;
3998 rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3999 CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
4001 rec->recovery = ctdb_op_init(rec, "recoveries");
4002 CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
4004 rec->priority_time = timeval_current();
4006 /* register a message port for sending memory dumps */
4007 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
4009 /* register a message port for recovery elections */
4010 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
4012 /* when nodes are disabled/enabled */
4013 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
4015 /* when we are asked to push out a flag change */
4016 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
4018 /* register a message port for vacuum fetch */
4019 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
4021 /* register a message port for reloadnodes */
4022 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
4024 /* register a message port for performing a takeover run */
4025 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
4027 /* register a message port for disabling the ip check for a short while */
4028 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
4030 /* register a message port for updating the recovery daemons node assignment for an ip */
4031 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);
4033 /* register a message port for forcing a rebalance of a node at the next
4034 reallocation */
4035 ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
4037 /* Register a message port for disabling takeover runs */
4038 ctdb_client_set_message_handler(ctdb,
4039 CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
4040 disable_takeover_runs_handler, rec);
4042 /* Register a message port for disabling recoveries */
4043 ctdb_client_set_message_handler(ctdb,
4044 CTDB_SRVID_DISABLE_RECOVERIES,
4045 disable_recoveries_handler, rec);
4047 /* register a message port for detaching database */
4048 ctdb_client_set_message_handler(ctdb,
4049 CTDB_SRVID_DETACH_DATABASE,
4050 detach_database_handler, rec);
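/* Illustrative sketch only (not part of this file): a ctdb client can drive
 * one of the message ports registered above by sending a struct srvid_request
 * to the recovery master.  For a takeover run that would look roughly like
 * the following, where recmaster_pnn and reply_srvid are placeholders chosen
 * by the caller:
 *
 *	struct srvid_request rq;
 *	TDB_DATA data;
 *
 *	rq.pnn   = ctdb_get_pnn(ctdb);	// node the reply should be sent to
 *	rq.srvid = reply_srvid;		// srvid the caller listens on
 *	data.dptr  = (uint8_t *)&rq;
 *	data.dsize = sizeof(rq);
 *	ctdb_client_send_message(ctdb, recmaster_pnn,
 *				 CTDB_SRVID_TAKEOVER_RUN, data);
 *
 * The handler registered above is then expected to reply to rq.pnn/rq.srvid
 * once the takeover run has completed.
 */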
4052 for (;;) {
4053 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4054 struct timeval start;
4055 double elapsed;
4057 if (!mem_ctx) {
4058 DEBUG(DEBUG_CRIT,(__location__
4059 " Failed to create temp context\n"));
4060 exit(-1);
4063 start = timeval_current();
4064 main_loop(ctdb, rec, mem_ctx);
4065 talloc_free(mem_ctx);
4067 /* we only check for recovery once every RecoverInterval seconds (by default, one second) */
4068 elapsed = timeval_elapsed(&start);
4069 if (elapsed < ctdb->tunable.recover_interval) {
4070 ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
4071 - elapsed);
4077 event handler for when the main ctdbd dies
4079 static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde,
4080 uint16_t flags, void *private_data)
4082 DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
4083 _exit(1);
4087 called regularly to verify that the recovery daemon is still running
4089 static void ctdb_check_recd(struct event_context *ev, struct timed_event *te,
4090 struct timeval yt, void *p)
4092 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
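/* sending signal 0 does not deliver a signal; it merely checks that the
   recovery daemon pid still exists */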
4094 if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
4095 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
4097 event_add_timed(ctdb->ev, ctdb, timeval_zero(),
4098 ctdb_restart_recd, ctdb);
4100 return;
4103 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4104 timeval_current_ofs(30, 0),
4105 ctdb_check_recd, ctdb);
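/* SIGCHLD handler for the recovery daemon: reap any exited children
   (non-blocking, via WNOHANG) so they cannot linger as zombies */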
4108 static void recd_sig_child_handler(struct event_context *ev,
4109 struct signal_event *se, int signum, int count,
4110 void *dont_care,
4111 void *private_data)
4113 // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4114 int status;
4115 pid_t pid = -1;
4117 while (pid != 0) {
4118 pid = waitpid(-1, &status, WNOHANG);
4119 if (pid == -1) {
4120 if (errno != ECHILD) {
4121 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
4123 return;
4125 if (pid > 0) {
4126 DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
4132 start up the recovery daemon as a child of the main ctdb daemon
4134 int ctdb_start_recoverd(struct ctdb_context *ctdb)
4136 int fd[2];
4137 struct signal_event *se;
4138 struct tevent_fd *fde;
4140 if (pipe(fd) != 0) {
4141 return -1;
4144 ctdb->recoverd_pid = ctdb_fork(ctdb);
4145 if (ctdb->recoverd_pid == -1) {
4146 return -1;
4149 if (ctdb->recoverd_pid != 0) {
4150 talloc_free(ctdb->recd_ctx);
4151 ctdb->recd_ctx = talloc_new(ctdb);
4152 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
4154 close(fd[0]);
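/* parent: keep the write end of the pipe open; it is closed implicitly when
   ctdbd exits, which is how the child detects that its parent is gone */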
4155 event_add_timed(ctdb->ev, ctdb->recd_ctx,
4156 timeval_current_ofs(30, 0),
4157 ctdb_check_recd, ctdb);
4158 return 0;
4161 close(fd[1]);
4163 srandom(getpid() ^ time(NULL));
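/* give the forked child its own random seed so it does not continue the
   parent's random sequence */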
4165 ctdb_set_process_name("ctdb_recoverd");
4166 if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
4167 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
4168 exit(1);
4171 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
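/* child: watch the read end of the pipe; the parent never writes to it, so
   the only event that can arrive is EOF when the main daemon goes away, at
   which point ctdb_recoverd_parent() exits this process */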
4173 fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
4174 ctdb_recoverd_parent, &fd[0]);
4175 tevent_fd_set_auto_close(fde);
4177 /* set up a handler to pick up sigchld */
4178 se = event_add_signal(ctdb->ev, ctdb,
4179 SIGCHLD, 0,
4180 recd_sig_child_handler,
4181 ctdb);
4182 if (se == NULL) {
4183 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
4184 exit(1);
4187 monitor_cluster(ctdb);
4189 DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
4190 return -1;
4194 shut down the recovery daemon
4196 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
4198 if (ctdb->recoverd_pid == 0) {
4199 return;
4202 DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
4203 ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
4205 TALLOC_FREE(ctdb->recd_ctx);
4206 TALLOC_FREE(ctdb->recd_ping_count);
4209 static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te,
4210 struct timeval t, void *private_data)
4212 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4214 DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
4215 ctdb_stop_recoverd(ctdb);
4216 ctdb_start_recoverd(ctdb);