ctdb/server/ctdb_recoverd.c

   1 /*
   2    ctdb recovery daemon
   3
   4    Copyright (C) Ronnie Sahlberg  2007
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20 #include "replace.h"
  21 #include "system/filesys.h"
  22 #include "system/time.h"
  23 #include "system/network.h"
  24 #include "system/wait.h"
  25
  26 #include <popt.h>
  27 #include <talloc.h>
  28 #include <tevent.h>
  29 #include <tdb.h>
  30
  31 #include "lib/tdb_wrap/tdb_wrap.h"
  32 #include "lib/util/dlinklist.h"
  33 #include "lib/util/debug.h"
  34 #include "lib/util/samba_util.h"
  35 #include "lib/util/sys_rw.h"
  36 #include "lib/util/util_process.h"
  37
  38 #include "ctdb_private.h"
  39 #include "ctdb_client.h"
  40
  41 #include "common/system_socket.h"
  42 #include "common/common.h"
  43 #include "common/logging.h"
  44
  45 #include "server/ctdb_config.h"
  46
  47 #include "ctdb_cluster_mutex.h"
  48
  49 /* List of SRVID requests that need to be processed */
  50 struct srvid_list {
  51         struct srvid_list *next, *prev;
  52         struct ctdb_srvid_message *request;
  53 };
  54
  55 struct srvid_requests {
  56         struct srvid_list *requests;
  57 };
  58
  59 static void srvid_request_reply(struct ctdb_context *ctdb,
  60                                 struct ctdb_srvid_message *request,
  61                                 TDB_DATA result)
  62 {
  63         /* Someone that sent srvid==0 does not want a reply */
  64         if (request->srvid == 0) {
  65                 talloc_free(request);
  66                 return;
  67         }
  68
  69         if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
  70                                      result) == 0) {
  71                 DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
  72                                   (unsigned)request->pnn,
  73                                   (unsigned long long)request->srvid));
  74         } else {
  75                 DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
  76                                  (unsigned)request->pnn,
  77                                  (unsigned long long)request->srvid));
  78         }
  79
  80         talloc_free(request);
  81 }
  82
  83 static void srvid_requests_reply(struct ctdb_context *ctdb,
  84                                  struct srvid_requests **requests,
  85                                  TDB_DATA result)
  86 {
  87         struct srvid_list *r;
  88
  89         if (*requests == NULL) {
  90                 return;
  91         }
  92
  93         for (r = (*requests)->requests; r != NULL; r = r->next) {
  94                 srvid_request_reply(ctdb, r->request, result);
  95         }
  96
  97         /* Free the list structure... */
  98         TALLOC_FREE(*requests);
  99 }
 100
 101 static void srvid_request_add(struct ctdb_context *ctdb,
 102                               struct srvid_requests **requests,
 103                               struct ctdb_srvid_message *request)
 104 {
 105         struct srvid_list *t;
 106         int32_t ret;
 107         TDB_DATA result;
 108
 109         if (*requests == NULL) {
 110                 *requests = talloc_zero(ctdb, struct srvid_requests);
 111                 if (*requests == NULL) {
 112                         goto nomem;
 113                 }
 114         }
 115
 116         t = talloc_zero(*requests, struct srvid_list);
 117         if (t == NULL) {
 118                 /* If *requests was just allocated above then free it */
 119                 if ((*requests)->requests == NULL) {
 120                         TALLOC_FREE(*requests);
 121                 }
 122                 goto nomem;
 123         }
 124
 125         t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
 126         DLIST_ADD((*requests)->requests, t);
 127
 128         return;
 129
 130 nomem:
 131         /* Failed to add the request to the list.  Send a fail. */
 132         DEBUG(DEBUG_ERR, (__location__
 133                           " Out of memory, failed to queue SRVID request\n"));
 134         ret = -ENOMEM;
 135         result.dsize = sizeof(ret);
 136         result.dptr = (uint8_t *)&ret;
 137         srvid_request_reply(ctdb, request, result);
 138 }
 139
 140 /* An abstraction to allow an operation (takeover runs, recoveries,
 141  * ...) to be disabled for a given timeout */
 142 struct ctdb_op_state {
 143         struct tevent_timer *timer;
 144         bool in_progress;
 145         const char *name;
 146 };
 147
 148 static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
 149 {
 150         struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
 151
 152         if (state != NULL) {
 153                 state->in_progress = false;
 154                 state->name = name;
 155         }
 156
 157         return state;
 158 }
 159
 160 static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
 161 {
 162         return state->timer != NULL;
 163 }
 164
 165 static bool ctdb_op_begin(struct ctdb_op_state *state)
 166 {
 167         if (ctdb_op_is_disabled(state)) {
 168                 DEBUG(DEBUG_NOTICE,
 169                       ("Unable to begin - %s are disabled\n", state->name));
 170                 return false;
 171         }
 172
 173         state->in_progress = true;
 174         return true;
 175 }
 176
 177 static bool ctdb_op_end(struct ctdb_op_state *state)
 178 {
 179         return state->in_progress = false;
 180 }
 181
 182 static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
 183 {
 184         return state->in_progress;
 185 }
 186
 187 static void ctdb_op_enable(struct ctdb_op_state *state)
 188 {
 189         TALLOC_FREE(state->timer);
 190 }
 191
 192 static void ctdb_op_timeout_handler(struct tevent_context *ev,
 193                                     struct tevent_timer *te,
 194                                     struct timeval yt, void *p)
 195 {
 196         struct ctdb_op_state *state =
 197                 talloc_get_type(p, struct ctdb_op_state);
 198
 199         DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
 200         ctdb_op_enable(state);
 201 }
 202
 203 static int ctdb_op_disable(struct ctdb_op_state *state,
 204                            struct tevent_context *ev,
 205                            uint32_t timeout)
 206 {
 207         if (timeout == 0) {
 208                 DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
 209                 ctdb_op_enable(state);
 210                 return 0;
 211         }
 212
 213         if (state->in_progress) {
 214                 DEBUG(DEBUG_ERR,
 215                       ("Unable to disable %s - in progress\n", state->name));
 216                 return -EAGAIN;
 217         }
 218
 219         DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
 220                             state->name, timeout));
 221
 222         /* Clear any old timers */
 223         talloc_free(state->timer);
 224
 225         /* Arrange for the timeout to occur */
 226         state->timer = tevent_add_timer(ev, state,
 227                                         timeval_current_ofs(timeout, 0),
 228                                         ctdb_op_timeout_handler, state);
 229         if (state->timer == NULL) {
 230                 DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
 231                 return -ENOMEM;
 232         }
 233
 234         return 0;
 235 }
 236
 237 struct ctdb_banning_state {
 238         uint32_t count;
 239         struct timeval last_reported_time;
 240 };
 241
 242 struct ctdb_recovery_lock_handle;
 243
 244 /*
 245   private state of recovery daemon
 246  */
 247 struct ctdb_recoverd {
 248         struct ctdb_context *ctdb;
 249         uint32_t recmaster;
 250         uint32_t last_culprit_node;
 251         struct ctdb_node_map_old *nodemap;
 252         struct timeval priority_time;
 253         bool need_takeover_run;
 254         bool need_recovery;
 255         uint32_t node_flags;
 256         struct tevent_timer *send_election_te;
 257         struct tevent_timer *election_timeout;
 258         struct srvid_requests *reallocate_requests;
 259         struct ctdb_op_state *takeover_run;
 260         struct ctdb_op_state *recovery;
 261         struct ctdb_iface_list_old *ifaces;
 262         uint32_t *force_rebalance_nodes;
 263         struct ctdb_node_capabilities *caps;
 264         bool frozen_on_inactive;
 265         struct ctdb_recovery_lock_handle *recovery_lock_handle;
 266 };
 267
 268 #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
 269 #define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
 270
 271 static void ctdb_restart_recd(struct tevent_context *ev,
 272                               struct tevent_timer *te, struct timeval t,
 273                               void *private_data);
 274
 275 /*
 276   ban a node for a period of time
 277  */
 278 static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
 279 {
 280         int ret;
 281         struct ctdb_context *ctdb = rec->ctdb;
 282         struct ctdb_ban_state bantime;
 283
 284         if (!ctdb_validate_pnn(ctdb, pnn)) {
 285                 DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
 286                 return;
 287         }
 288
 289         DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
 290
 291         bantime.pnn  = pnn;
 292         bantime.time = ban_time;
 293
 294         ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
 295         if (ret != 0) {
 296                 DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
 297                 return;
 298         }
 299
 300 }
 301
 302 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
 303
 304
 305 /*
 306   remember the trouble maker
 307  */
 308 static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
 309 {
 310         struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
 311         struct ctdb_banning_state *ban_state;
 312
 313         if (culprit > ctdb->num_nodes) {
 314                 DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
 315                 return;
 316         }
 317
 318         /* If we are banned or stopped, do not set other nodes as culprits */
 319         if (rec->node_flags & NODE_FLAGS_INACTIVE) {
 320                 DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
 321                 return;
 322         }
 323
 324         if (ctdb->nodes[culprit]->ban_state == NULL) {
 325                 ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
 326                 CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
 327
 328
 329         }
 330         ban_state = ctdb->nodes[culprit]->ban_state;
 331         if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
 332                 /* this was the first time in a long while this node
 333                    misbehaved so we will forgive any old transgressions.
 334                 */
 335                 ban_state->count = 0;
 336         }
 337
 338         ban_state->count += count;
 339         ban_state->last_reported_time = timeval_current();
 340         rec->last_culprit_node = culprit;
 341 }
 342
 343 /*
 344   remember the trouble maker
 345  */
 346 static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
 347 {
 348         ctdb_set_culprit_count(rec, culprit, 1);
 349 }
 350
 351 /*
 352   Retrieve capabilities from all connected nodes
 353  */
 354 static int update_capabilities(struct ctdb_recoverd *rec,
 355                                struct ctdb_node_map_old *nodemap)
 356 {
 357         uint32_t *capp;
 358         TALLOC_CTX *tmp_ctx;
 359         struct ctdb_node_capabilities *caps;
 360         struct ctdb_context *ctdb = rec->ctdb;
 361
 362         tmp_ctx = talloc_new(rec);
 363         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 364
 365         caps = ctdb_get_capabilities(ctdb, tmp_ctx,
 366                                      CONTROL_TIMEOUT(), nodemap);
 367
 368         if (caps == NULL) {
 369                 DEBUG(DEBUG_ERR,
 370                       (__location__ " Failed to get node capabilities\n"));
 371                 talloc_free(tmp_ctx);
 372                 return -1;
 373         }
 374
 375         capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
 376         if (capp == NULL) {
 377                 DEBUG(DEBUG_ERR,
 378                       (__location__
 379                        " Capabilities don't include current node.\n"));
 380                 talloc_free(tmp_ctx);
 381                 return -1;
 382         }
 383         ctdb->capabilities = *capp;
 384
 385         TALLOC_FREE(rec->caps);
 386         rec->caps = talloc_steal(rec, caps);
 387
 388         talloc_free(tmp_ctx);
 389         return 0;
 390 }
 391
 392 /*
 393   change recovery mode on all nodes
 394  */
 395 static int set_recovery_mode(struct ctdb_context *ctdb,
 396                              struct ctdb_recoverd *rec,
 397                              struct ctdb_node_map_old *nodemap,
 398                              uint32_t rec_mode)
 399 {
 400         TDB_DATA data;
 401         uint32_t *nodes;
 402         TALLOC_CTX *tmp_ctx;
 403
 404         tmp_ctx = talloc_new(ctdb);
 405         CTDB_NO_MEMORY(ctdb, tmp_ctx);
 406
 407         nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
 408
 409         data.dsize = sizeof(uint32_t);
 410         data.dptr = (unsigned char *)&rec_mode;
 411
 412         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
 413                                         nodes, 0,
 414                                         CONTROL_TIMEOUT(),
 415                                         false, data,
 416                                         NULL, NULL,
 417                                         NULL) != 0) {
 418                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
 419                 talloc_free(tmp_ctx);
 420                 return -1;
 421         }
 422
 423         talloc_free(tmp_ctx);
 424         return 0;
 425 }
 426
 427 /*
 428   ensure all other nodes have attached to any databases that we have
 429  */
 430 static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
 431                                            uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
 432 {
 433         int i, j, db, ret;
 434         struct ctdb_dbid_map_old *remote_dbmap;
 435
 436         /* verify that all other nodes have all our databases */
 437         for (j=0; j<nodemap->num; j++) {
 438                 /* we don't need to ourself ourselves */
 439                 if (nodemap->nodes[j].pnn == pnn) {
 440                         continue;
 441                 }
 442                 /* don't check nodes that are unavailable */
 443                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 444                         continue;
 445                 }
 446
 447                 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 448                                          mem_ctx, &remote_dbmap);
 449                 if (ret != 0) {
 450                         DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
 451                         return -1;
 452                 }
 453
 454                 /* step through all local databases */
 455                 for (db=0; db<dbmap->num;db++) {
 456                         const char *name;
 457
 458
 459                         for (i=0;i<remote_dbmap->num;i++) {
 460                                 if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
 461                                         break;
 462                                 }
 463                         }
 464                         /* the remote node already have this database */
 465                         if (i!=remote_dbmap->num) {
 466                                 continue;
 467                         }
 468                         /* ok so we need to create this database */
 469                         ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
 470                                                   dbmap->dbs[db].db_id, mem_ctx,
 471                                                   &name);
 472                         if (ret != 0) {
 473                                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
 474                                 return -1;
 475                         }
 476                         ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
 477                                                  nodemap->nodes[j].pnn,
 478                                                  mem_ctx, name,
 479                                                  dbmap->dbs[db].flags, NULL);
 480                         if (ret != 0) {
 481                                 DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
 482                                 return -1;
 483                         }
 484                 }
 485         }
 486
 487         return 0;
 488 }
 489
 490
 491 /*
 492   ensure we are attached to any databases that anyone else is attached to
 493  */
 494 static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
 495                                           uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
 496 {
 497         int i, j, db, ret;
 498         struct ctdb_dbid_map_old *remote_dbmap;
 499
 500         /* verify that we have all database any other node has */
 501         for (j=0; j<nodemap->num; j++) {
 502                 /* we don't need to ourself ourselves */
 503                 if (nodemap->nodes[j].pnn == pnn) {
 504                         continue;
 505                 }
 506                 /* don't check nodes that are unavailable */
 507                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
 508                         continue;
 509                 }
 510
 511                 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 512                                          mem_ctx, &remote_dbmap);
 513                 if (ret != 0) {
 514                         DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
 515                         return -1;
 516                 }
 517
 518                 /* step through all databases on the remote node */
 519                 for (db=0; db<remote_dbmap->num;db++) {
 520                         const char *name;
 521
 522                         for (i=0;i<(*dbmap)->num;i++) {
 523                                 if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
 524                                         break;
 525                                 }
 526                         }
 527                         /* we already have this db locally */
 528                         if (i!=(*dbmap)->num) {
 529                                 continue;
 530                         }
 531                         /* ok so we need to create this database and
 532                            rebuild dbmap
 533                          */
 534                         ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 535                                             remote_dbmap->dbs[db].db_id, mem_ctx, &name);
 536                         if (ret != 0) {
 537                                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
 538                                           nodemap->nodes[j].pnn));
 539                                 return -1;
 540                         }
 541                         ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn,
 542                                            mem_ctx, name,
 543                                            remote_dbmap->dbs[db].flags, NULL);
 544                         if (ret != 0) {
 545                                 DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
 546                                 return -1;
 547                         }
 548                         ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
 549                         if (ret != 0) {
 550                                 DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
 551                                 return -1;
 552                         }
 553                 }
 554         }
 555
 556         return 0;
 557 }
 558
 559 /*
 560   update flags on all active nodes
 561  */
 562 static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
 563 {
 564         int ret;
 565
 566         ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
 567                 if (ret != 0) {
 568                 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
 569                 return -1;
 570         }
 571
 572         return 0;
 573 }
 574
 575 /*
 576   called when a vacuum fetch has completed - just free it and do the next one
 577  */
 578 static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
 579 {
 580         talloc_free(state);
 581 }
 582
 583
 584 /**
 585  * Process one elements of the vacuum fetch list:
 586  * Migrate it over to us with the special flag
 587  * CTDB_CALL_FLAG_VACUUM_MIGRATION.
 588  */
 589 static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
 590                                      uint32_t pnn,
 591                                      struct ctdb_rec_data_old *r)
 592 {
 593         struct ctdb_client_call_state *state;
 594         TDB_DATA data;
 595         struct ctdb_ltdb_header *hdr;
 596         struct ctdb_call call;
 597
 598         ZERO_STRUCT(call);
 599         call.call_id = CTDB_NULL_FUNC;
 600         call.flags = CTDB_IMMEDIATE_MIGRATION;
 601         call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
 602
 603         call.key.dptr = &r->data[0];
 604         call.key.dsize = r->keylen;
 605
 606         /* ensure we don't block this daemon - just skip a record if we can't get
 607            the chainlock */
 608         if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
 609                 return true;
 610         }
 611
 612         data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
 613         if (data.dptr == NULL) {
 614                 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
 615                 return true;
 616         }
 617
 618         if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
 619                 free(data.dptr);
 620                 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
 621                 return true;
 622         }
 623
 624         hdr = (struct ctdb_ltdb_header *)data.dptr;
 625         if (hdr->dmaster == pnn) {
 626                 /* its already local */
 627                 free(data.dptr);
 628                 tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
 629                 return true;
 630         }
 631
 632         free(data.dptr);
 633
 634         state = ctdb_call_send(ctdb_db, &call);
 635         tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
 636         if (state == NULL) {
 637                 DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
 638                 return false;
 639         }
 640         state->async.fn = vacuum_fetch_callback;
 641         state->async.private_data = NULL;
 642
 643         return true;
 644 }
 645
 646
 647 /*
 648   handler for vacuum fetch
 649 */
 650 static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
 651                                  void *private_data)
 652 {
 653         struct ctdb_recoverd *rec = talloc_get_type(
 654                 private_data, struct ctdb_recoverd);
 655         struct ctdb_context *ctdb = rec->ctdb;
 656         struct ctdb_marshall_buffer *recs;
 657         int ret, i;
 658         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
 659         const char *name;
 660         struct ctdb_dbid_map_old *dbmap=NULL;
 661         uint8_t db_flags = 0;
 662         struct ctdb_db_context *ctdb_db;
 663         struct ctdb_rec_data_old *r;
 664
 665         recs = (struct ctdb_marshall_buffer *)data.dptr;
 666
 667         if (recs->count == 0) {
 668                 goto done;
 669         }
 670
 671         /* work out if the database is persistent */
 672         ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
 673         if (ret != 0) {
 674                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
 675                 goto done;
 676         }
 677
 678         for (i=0;i<dbmap->num;i++) {
 679                 if (dbmap->dbs[i].db_id == recs->db_id) {
 680                         db_flags = dbmap->dbs[i].flags;
 681                         break;
 682                 }
 683         }
 684         if (i == dbmap->num) {
 685                 DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
 686                 goto done;
 687         }
 688
 689         /* find the name of this database */
 690         if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
 691                 DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
 692                 goto done;
 693         }
 694
 695         /* attach to it */
 696         ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, db_flags);
 697         if (ctdb_db == NULL) {
 698                 DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
 699                 goto done;
 700         }
 701
 702         r = (struct ctdb_rec_data_old *)&recs->data[0];
 703         while (recs->count) {
 704                 bool ok;
 705
 706                 ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
 707                 if (!ok) {
 708                         break;
 709                 }
 710
 711                 r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
 712                 recs->count--;
 713         }
 714
 715 done:
 716         talloc_free(tmp_ctx);
 717 }
 718
 719
 720 /*
 721  * handler for database detach
 722  */
 723 static void detach_database_handler(uint64_t srvid, TDB_DATA data,
 724                                     void *private_data)
 725 {
 726         struct ctdb_recoverd *rec = talloc_get_type(
 727                 private_data, struct ctdb_recoverd);
 728         struct ctdb_context *ctdb = rec->ctdb;
 729         uint32_t db_id;
 730         struct ctdb_db_context *ctdb_db;
 731
 732         if (data.dsize != sizeof(db_id)) {
 733                 return;
 734         }
 735         db_id = *(uint32_t *)data.dptr;
 736
 737         ctdb_db = find_ctdb_db(ctdb, db_id);
 738         if (ctdb_db == NULL) {
 739                 /* database is not attached */
 740                 return;
 741         }
 742
 743         DLIST_REMOVE(ctdb->db_list, ctdb_db);
 744
 745         DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
 746                              ctdb_db->db_name));
 747         talloc_free(ctdb_db);
 748 }
 749
 750 /*
 751   called when ctdb_wait_timeout should finish
 752  */
 753 static void ctdb_wait_handler(struct tevent_context *ev,
 754                               struct tevent_timer *te,
 755                               struct timeval yt, void *p)
 756 {
 757         uint32_t *timed_out = (uint32_t *)p;
 758         (*timed_out) = 1;
 759 }
 760
 761 /*
 762   wait for a given number of seconds
 763  */
 764 static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
 765 {
 766         uint32_t timed_out = 0;
 767         time_t usecs = (secs - (time_t)secs) * 1000000;
 768         tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
 769                          ctdb_wait_handler, &timed_out);
 770         while (!timed_out) {
 771                 tevent_loop_once(ctdb->ev);
 772         }
 773 }
 774
 775 /*
 776   called when an election times out (ends)
 777  */
 778 static void ctdb_election_timeout(struct tevent_context *ev,
 779                                   struct tevent_timer *te,
 780                                   struct timeval t, void *p)
 781 {
 782         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
 783         rec->election_timeout = NULL;
 784         fast_start = false;
 785
 786         DEBUG(DEBUG_WARNING,("Election period ended\n"));
 787 }
 788
 789
 790 /*
 791   wait for an election to finish. It finished election_timeout seconds after
 792   the last election packet is received
 793  */
 794 static void ctdb_wait_election(struct ctdb_recoverd *rec)
 795 {
 796         struct ctdb_context *ctdb = rec->ctdb;
 797         while (rec->election_timeout) {
 798                 tevent_loop_once(ctdb->ev);
 799         }
 800 }
 801
 802 /*
 803   Update our local flags from all remote connected nodes.
 804   This is only run when we are or we belive we are the recovery master
 805  */
 806 static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
 807 {
 808         int j;
 809         struct ctdb_context *ctdb = rec->ctdb;
 810         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
 811
 812         /* get the nodemap for all active remote nodes and verify
 813            they are the same as for this node
 814          */
 815         for (j=0; j<nodemap->num; j++) {
 816                 struct ctdb_node_map_old *remote_nodemap=NULL;
 817                 int ret;
 818
 819                 if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
 820                         continue;
 821                 }
 822                 if (nodemap->nodes[j].pnn == ctdb->pnn) {
 823                         continue;
 824                 }
 825
 826                 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
 827                                            mem_ctx, &remote_nodemap);
 828                 if (ret != 0) {
 829                         DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
 830                                   nodemap->nodes[j].pnn));
 831                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
 832                         talloc_free(mem_ctx);
 833                         return -1;
 834                 }
 835                 if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
 836                         /* We should tell our daemon about this so it
 837                            updates its flags or else we will log the same
 838                            message again in the next iteration of recovery.
 839                            Since we are the recovery master we can just as
 840                            well update the flags on all nodes.
 841                         */
 842                         ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
 843                         if (ret != 0) {
 844                                 DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
 845                                 return -1;
 846                         }
 847
 848                         /* Update our local copy of the flags in the recovery
 849                            daemon.
 850                         */
 851                         DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
 852                                  nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
 853                                  nodemap->nodes[j].flags));
 854                         nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
 855                 }
 856                 talloc_free(remote_nodemap);
 857         }
 858         talloc_free(mem_ctx);
 859         return 0;
 860 }
 861
 862
 863 /* Create a new random generation id.
 864    The generation id can not be the INVALID_GENERATION id
 865 */
 866 static uint32_t new_generation(void)
 867 {
 868         uint32_t generation;
 869
 870         while (1) {
 871                 generation = random();
 872
 873                 if (generation != INVALID_GENERATION) {
 874                         break;
 875                 }
 876         }
 877
 878         return generation;
 879 }
 880
 881 static bool ctdb_recovery_have_lock(struct ctdb_recoverd *rec)
 882 {
 883         return (rec->recovery_lock_handle != NULL);
 884 }
 885
 886 struct ctdb_recovery_lock_handle {
 887         bool done;
 888         bool locked;
 889         double latency;
 890         struct ctdb_cluster_mutex_handle *h;
 891 };
 892
 893 static void take_reclock_handler(char status,
 894                                  double latency,
 895                                  void *private_data)
 896 {
 897         struct ctdb_recovery_lock_handle *s =
 898                 (struct ctdb_recovery_lock_handle *) private_data;
 899
 900         s->locked = (status == '0') ;
 901
 902         /*
 903          * If unsuccessful then ensure the process has exited and that
 904          * the file descriptor event handler has been cancelled
 905          */
 906         if (! s->locked) {
 907                 TALLOC_FREE(s->h);
 908         }
 909
 910         switch (status) {
 911         case '0':
 912                 s->latency = latency;
 913                 break;
 914
 915         case '1':
 916                 D_ERR("Unable to take recovery lock - contention\n");
 917                 break;
 918
 919         case '2':
 920                 D_ERR("Unable to take recovery lock - timeout\n");
 921                 break;
 922
 923         default:
 924                 D_ERR("Unable to take recover lock - unknown error\n");
 925         }
 926
 927         s->done = true;
 928 }
 929
 930 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec);
 931
 932 static void lost_reclock_handler(void *private_data)
 933 {
 934         struct ctdb_recoverd *rec = talloc_get_type_abort(
 935                 private_data, struct ctdb_recoverd);
 936
 937         DEBUG(DEBUG_ERR,
 938               ("Recovery lock helper terminated unexpectedly - "
 939                "trying to retake recovery lock\n"));
 940         TALLOC_FREE(rec->recovery_lock_handle);
 941         if (! ctdb_recovery_lock(rec)) {
 942                 DEBUG(DEBUG_ERR, ("Failed to take recovery lock\n"));
 943         }
 944 }
 945
 946 static bool ctdb_recovery_lock(struct ctdb_recoverd *rec)
 947 {
 948         struct ctdb_context *ctdb = rec->ctdb;
 949         struct ctdb_cluster_mutex_handle *h;
 950         struct ctdb_recovery_lock_handle *s;
 951
 952         s = talloc_zero(rec, struct ctdb_recovery_lock_handle);
 953         if (s == NULL) {
 954                 DBG_ERR("Memory allocation error\n");
 955                 return false;
 956         };
 957
 958         h = ctdb_cluster_mutex(s,
 959                                ctdb,
 960                                ctdb->recovery_lock,
 961                                0,
 962                                take_reclock_handler,
 963                                s,
 964                                lost_reclock_handler,
 965                                rec);
 966         if (h == NULL) {
 967                 talloc_free(s);
 968                 return false;
 969         }
 970
 971         rec->recovery_lock_handle = s;
 972         s->h = h;
 973
 974         while (! s->done) {
 975                 tevent_loop_once(ctdb->ev);
 976         }
 977
 978         if (! s->locked) {
 979                 TALLOC_FREE(rec->recovery_lock_handle);
 980                 return false;
 981         }
 982
 983         ctdb_ctrl_report_recd_lock_latency(ctdb,
 984                                            CONTROL_TIMEOUT(),
 985                                            s->latency);
 986
 987         return true;
 988 }
 989
 990 static void ctdb_recovery_unlock(struct ctdb_recoverd *rec)
 991 {
 992         if (rec->recovery_lock_handle == NULL) {
 993                 return;
 994         }
 995
 996         if (! rec->recovery_lock_handle->done) {
 997                 /*
 998                  * Taking of recovery lock still in progress.  Free
 999                  * the cluster mutex handle to release it but leave
1000                  * the recovery lock handle in place to allow taking
1001                  * of the lock to fail.
1002                  */
1003                 D_NOTICE("Cancelling recovery lock\n");
1004                 TALLOC_FREE(rec->recovery_lock_handle->h);
1005                 rec->recovery_lock_handle->done = true;
1006                 rec->recovery_lock_handle->locked = false;
1007                 return;
1008         }
1009
1010         D_NOTICE("Releasing recovery lock\n");
1011         TALLOC_FREE(rec->recovery_lock_handle);
1012 }
1013
1014 static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
1015 {
1016         struct ctdb_context *ctdb = rec->ctdb;
1017         int i;
1018         struct ctdb_banning_state *ban_state;
1019
1020         *self_ban = false;
1021         for (i=0; i<ctdb->num_nodes; i++) {
1022                 if (ctdb->nodes[i]->ban_state == NULL) {
1023                         continue;
1024                 }
1025                 ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
1026                 if (ban_state->count < 2*ctdb->num_nodes) {
1027                         continue;
1028                 }
1029
1030                 DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
1031                         ctdb->nodes[i]->pnn, ban_state->count,
1032                         ctdb->tunable.recovery_ban_period));
1033                 ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
1034                 ban_state->count = 0;
1035
1036                 /* Banning ourself? */
1037                 if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
1038                         *self_ban = true;
1039                 }
1040         }
1041 }
1042
1043 struct helper_state {
1044         int fd[2];
1045         pid_t pid;
1046         int result;
1047         bool done;
1048 };
1049
1050 static void helper_handler(struct tevent_context *ev,
1051                            struct tevent_fd *fde,
1052                            uint16_t flags, void *private_data)
1053 {
1054         struct helper_state *state = talloc_get_type_abort(
1055                 private_data, struct helper_state);
1056         int ret;
1057
1058         ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
1059         if (ret != sizeof(state->result)) {
1060                 state->result = EPIPE;
1061         }
1062
1063         state->done = true;
1064 }
1065
1066 static int helper_run(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1067                       const char *prog, const char *arg, const char *type)
1068 {
1069         struct helper_state *state;
1070         struct tevent_fd *fde;
1071         const char **args;
1072         int nargs, ret;
1073         uint32_t recmaster = rec->recmaster;
1074
1075         state = talloc_zero(mem_ctx, struct helper_state);
1076         if (state == NULL) {
1077                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1078                 return -1;
1079         }
1080
1081         state->pid = -1;
1082
1083         ret = pipe(state->fd);
1084         if (ret != 0) {
1085                 DEBUG(DEBUG_ERR,
1086                       ("Failed to create pipe for %s helper\n", type));
1087                 goto fail;
1088         }
1089
1090         set_close_on_exec(state->fd[0]);
1091
1092         nargs = 4;
1093         args = talloc_array(state, const char *, nargs);
1094         if (args == NULL) {
1095                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1096                 goto fail;
1097         }
1098
1099         args[0] = talloc_asprintf(args, "%d", state->fd[1]);
1100         if (args[0] == NULL) {
1101                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1102                 goto fail;
1103         }
1104         args[1] = rec->ctdb->daemon.name;
1105         args[2] = arg;
1106         args[3] = NULL;
1107
1108         if (args[2] == NULL) {
1109                 nargs = 3;
1110         }
1111
1112         state->pid = ctdb_vfork_exec(state, rec->ctdb, prog, nargs, args);
1113         if (state->pid == -1) {
1114                 DEBUG(DEBUG_ERR,
1115                       ("Failed to create child for %s helper\n", type));
1116                 goto fail;
1117         }
1118
1119         close(state->fd[1]);
1120         state->fd[1] = -1;
1121
1122         state->done = false;
1123
1124         fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
1125                             TEVENT_FD_READ, helper_handler, state);
1126         if (fde == NULL) {
1127                 goto fail;
1128         }
1129         tevent_fd_set_auto_close(fde);
1130
1131         while (!state->done) {
1132                 tevent_loop_once(rec->ctdb->ev);
1133
1134                 /* If recmaster changes, we have lost election */
1135                 if (recmaster != rec->recmaster) {
1136                         D_ERR("Recmaster changed to %u, aborting %s\n",
1137                               rec->recmaster, type);
1138                         state->result = 1;
1139                         break;
1140                 }
1141         }
1142
1143         close(state->fd[0]);
1144         state->fd[0] = -1;
1145
1146         if (state->result != 0) {
1147                 goto fail;
1148         }
1149
1150         ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1151         talloc_free(state);
1152         return 0;
1153
1154 fail:
1155         if (state->fd[0] != -1) {
1156                 close(state->fd[0]);
1157         }
1158         if (state->fd[1] != -1) {
1159                 close(state->fd[1]);
1160         }
1161         if (state->pid != -1) {
1162                 ctdb_kill(rec->ctdb, state->pid, SIGKILL);
1163         }
1164         talloc_free(state);
1165         return -1;
1166 }
1167
1168
1169 static int ctdb_takeover(struct ctdb_recoverd *rec,
1170                          uint32_t *force_rebalance_nodes)
1171 {
1172         static char prog[PATH_MAX+1] = "";
1173         char *arg;
1174         int i, ret;
1175
1176         if (!ctdb_set_helper("takeover_helper", prog, sizeof(prog),
1177                              "CTDB_TAKEOVER_HELPER", CTDB_HELPER_BINDIR,
1178                              "ctdb_takeover_helper")) {
1179                 ctdb_die(rec->ctdb, "Unable to set takeover helper\n");
1180         }
1181
1182         arg = NULL;
1183         for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1184                 uint32_t pnn = force_rebalance_nodes[i];
1185                 if (arg == NULL) {
1186                         arg = talloc_asprintf(rec, "%u", pnn);
1187                 } else {
1188                         arg = talloc_asprintf_append(arg, ",%u", pnn);
1189                 }
1190                 if (arg == NULL) {
1191                         DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1192                         return -1;
1193                 }
1194         }
1195
1196         if (ctdb_config.failover_disabled) {
1197                 ret = setenv("CTDB_DISABLE_IP_FAILOVER", "1", 1);
1198                 if (ret != 0) {
1199                         D_ERR("Failed to set CTDB_DISABLE_IP_FAILOVER variable\n");
1200                         return -1;
1201                 }
1202         }
1203
1204         return helper_run(rec, rec, prog, arg, "takeover");
1205 }
1206
1207 static bool do_takeover_run(struct ctdb_recoverd *rec,
1208                             struct ctdb_node_map_old *nodemap)
1209 {
1210         uint32_t *nodes = NULL;
1211         struct ctdb_disable_message dtr;
1212         TDB_DATA data;
1213         int i;
1214         uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
1215         int ret;
1216         bool ok;
1217
1218         DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
1219
1220         if (ctdb_op_is_in_progress(rec->takeover_run)) {
1221                 DEBUG(DEBUG_ERR, (__location__
1222                                   " takeover run already in progress \n"));
1223                 ok = false;
1224                 goto done;
1225         }
1226
1227         if (!ctdb_op_begin(rec->takeover_run)) {
1228                 ok = false;
1229                 goto done;
1230         }
1231
1232         /* Disable IP checks (takeover runs, really) on other nodes
1233          * while doing this takeover run.  This will stop those other
1234          * nodes from triggering takeover runs when think they should
1235          * be hosting an IP but it isn't yet on an interface.  Don't
1236          * wait for replies since a failure here might cause some
1237          * noise in the logs but will not actually cause a problem.
1238          */
1239         ZERO_STRUCT(dtr);
1240         dtr.srvid = 0; /* No reply */
1241         dtr.pnn = -1;
1242
1243         data.dptr  = (uint8_t*)&dtr;
1244         data.dsize = sizeof(dtr);
1245
1246         nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
1247
1248         /* Disable for 60 seconds.  This can be a tunable later if
1249          * necessary.
1250          */
1251         dtr.timeout = 60;
1252         for (i = 0; i < talloc_array_length(nodes); i++) {
1253                 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1254                                              CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1255                                              data) != 0) {
1256                         DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
1257                 }
1258         }
1259
1260         ret = ctdb_takeover(rec, rec->force_rebalance_nodes);
1261
1262         /* Reenable takeover runs and IP checks on other nodes */
1263         dtr.timeout = 0;
1264         for (i = 0; i < talloc_array_length(nodes); i++) {
1265                 if (ctdb_client_send_message(rec->ctdb, nodes[i],
1266                                              CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
1267                                              data) != 0) {
1268                         DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
1269                 }
1270         }
1271
1272         if (ret != 0) {
1273                 DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
1274                 ok = false;
1275                 goto done;
1276         }
1277
1278         ok = true;
1279         /* Takeover run was successful so clear force rebalance targets */
1280         if (rebalance_nodes == rec->force_rebalance_nodes) {
1281                 TALLOC_FREE(rec->force_rebalance_nodes);
1282         } else {
1283                 DEBUG(DEBUG_WARNING,
1284                       ("Rebalance target nodes changed during takeover run - not clearing\n"));
1285         }
1286 done:
1287         rec->need_takeover_run = !ok;
1288         talloc_free(nodes);
1289         ctdb_op_end(rec->takeover_run);
1290
1291         DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
1292         return ok;
1293 }
1294
1295 static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
1296 {
1297         static char prog[PATH_MAX+1] = "";
1298         const char *arg;
1299
1300         if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
1301                              "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
1302                              "ctdb_recovery_helper")) {
1303                 ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
1304         }
1305
1306         arg = talloc_asprintf(mem_ctx, "%u", new_generation());
1307         if (arg == NULL) {
1308                 DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
1309                 return -1;
1310         }
1311
1312         setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);
1313
1314         return helper_run(rec, mem_ctx, prog, arg, "recovery");
1315 }
1316
1317 /*
1318   we are the recmaster, and recovery is needed - start a recovery run
1319  */
1320 static int do_recovery(struct ctdb_recoverd *rec,
1321                        TALLOC_CTX *mem_ctx, uint32_t pnn,
1322                        struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
1323 {
1324         struct ctdb_context *ctdb = rec->ctdb;
1325         int i, ret;
1326         struct ctdb_dbid_map_old *dbmap;
1327         bool self_ban;
1328
1329         DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
1330
1331         /* Check if the current node is still the recmaster.  It's possible that
1332          * re-election has changed the recmaster.
1333          */
1334         if (pnn != rec->recmaster) {
1335                 DEBUG(DEBUG_NOTICE,
1336                       ("Recovery master changed to %u, aborting recovery\n",
1337                        rec->recmaster));
1338                 return -1;
1339         }
1340
1341         /* if recovery fails, force it again */
1342         rec->need_recovery = true;
1343
1344         if (!ctdb_op_begin(rec->recovery)) {
1345                 return -1;
1346         }
1347
1348         if (rec->election_timeout) {
1349                 /* an election is in progress */
1350                 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
1351                 goto fail;
1352         }
1353
1354         ban_misbehaving_nodes(rec, &self_ban);
1355         if (self_ban) {
1356                 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
1357                 goto fail;
1358         }
1359
1360         if (ctdb->recovery_lock != NULL) {
1361                 if (ctdb_recovery_have_lock(rec)) {
1362                         D_NOTICE("Already holding recovery lock\n");
1363                 } else {
1364                         bool ok;
1365
1366                         D_NOTICE("Attempting to take recovery lock (%s)\n",
1367                                  ctdb->recovery_lock);
1368
1369                         ok = ctdb_recovery_lock(rec);
1370                         if (! ok) {
1371                                 D_ERR("Unable to take recovery lock\n");
1372
1373                                 if (pnn != rec->recmaster) {
1374                                         D_NOTICE("Recovery master changed to %u,"
1375                                                  " aborting recovery\n",
1376                                                  rec->recmaster);
1377                                         rec->need_recovery = false;
1378                                         goto fail;
1379                                 }
1380
1381                                 if (ctdb->runstate ==
1382                                     CTDB_RUNSTATE_FIRST_RECOVERY) {
1383                                         /*
1384                                          * First recovery?  Perhaps
1385                                          * current node does not yet
1386                                          * know who the recmaster is.
1387                                          */
1388                                         D_ERR("Retrying recovery\n");
1389                                         goto fail;
1390                                 }
1391
1392                                 D_ERR("Abort recovery, "
1393                                       "ban this node for %u seconds\n",
1394                                       ctdb->tunable.recovery_ban_period);
1395                                 ctdb_ban_node(rec,
1396                                               pnn,
1397                                               ctdb->tunable.recovery_ban_period);
1398                                 goto fail;
1399                         }
1400                         D_NOTICE("Recovery lock taken successfully\n");
1401                 }
1402         }
1403
1404         DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
1405
1406         /* get a list of all databases */
1407         ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
1408         if (ret != 0) {
1409                 DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
1410                 goto fail;
1411         }
1412
1413         /* we do the db creation before we set the recovery mode, so the freeze happens
1414            on all databases we will be dealing with. */
1415
1416         /* verify that we have all the databases any other node has */
1417         ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
1418         if (ret != 0) {
1419                 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
1420                 goto fail;
1421         }
1422
1423         /* verify that all other nodes have all our databases */
1424         ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
1425         if (ret != 0) {
1426                 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
1427                 goto fail;
1428         }
1429         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
1430
1431
1432         /* Retrieve capabilities from all connected nodes */
1433         ret = update_capabilities(rec, nodemap);
1434         if (ret!=0) {
1435                 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
1436                 return -1;
1437         }
1438
1439         /*
1440           update all nodes to have the same flags that we have
1441          */
1442         for (i=0;i<nodemap->num;i++) {
1443                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1444                         continue;
1445                 }
1446
1447                 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
1448                 if (ret != 0) {
1449                         if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1450                                 DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
1451                         } else {
1452                                 DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
1453                                 return -1;
1454                         }
1455                 }
1456         }
1457
1458         DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
1459
1460         ret = db_recovery_parallel(rec, mem_ctx);
1461         if (ret != 0) {
1462                 goto fail;
1463         }
1464
1465         do_takeover_run(rec, nodemap);
1466
1467         /* send a message to all clients telling them that the cluster
1468            has been reconfigured */
1469         ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
1470                                        CTDB_SRVID_RECONFIGURE, tdb_null);
1471         if (ret != 0) {
1472                 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
1473                 goto fail;
1474         }
1475
1476         DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
1477
1478         rec->need_recovery = false;
1479         ctdb_op_end(rec->recovery);
1480
1481         /* we managed to complete a full recovery, make sure to forgive
1482            any past sins by the nodes that could now participate in the
1483            recovery.
1484         */
1485         DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
1486         for (i=0;i<nodemap->num;i++) {
1487                 struct ctdb_banning_state *ban_state;
1488
1489                 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1490                         continue;
1491                 }
1492
1493                 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
1494                 if (ban_state == NULL) {
1495                         continue;
1496                 }
1497
1498                 ban_state->count = 0;
1499         }
1500
1501         /* We just finished a recovery successfully.
1502            We now wait for rerecovery_timeout before we allow
1503            another recovery to take place.
1504         */
1505         DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
1506         ctdb_op_disable(rec->recovery, ctdb->ev,
1507                         ctdb->tunable.rerecovery_timeout);
1508         return 0;
1509
1510 fail:
1511         ctdb_op_end(rec->recovery);
1512         return -1;
1513 }
1514
1515
1516 /*
1517   elections are won by first checking the number of connected nodes, then
1518   the priority time, then the pnn
1519  */
1520 struct election_message {
1521         uint32_t num_connected;
1522         struct timeval priority_time;
1523         uint32_t pnn;
1524         uint32_t node_flags;
1525 };
1526
1527 /*
1528   form this nodes election data
1529  */
1530 static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
1531 {
1532         int ret, i;
1533         struct ctdb_node_map_old *nodemap;
1534         struct ctdb_context *ctdb = rec->ctdb;
1535
1536         ZERO_STRUCTP(em);
1537
1538         em->pnn = rec->ctdb->pnn;
1539         em->priority_time = rec->priority_time;
1540
1541         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
1542         if (ret != 0) {
1543                 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
1544                 return;
1545         }
1546
1547         rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
1548         em->node_flags = rec->node_flags;
1549
1550         for (i=0;i<nodemap->num;i++) {
1551                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
1552                         em->num_connected++;
1553                 }
1554         }
1555
1556         /* we shouldnt try to win this election if we cant be a recmaster */
1557         if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1558                 em->num_connected = 0;
1559                 em->priority_time = timeval_current();
1560         }
1561
1562         talloc_free(nodemap);
1563 }
1564
1565 /*
1566   see if the given election data wins
1567  */
1568 static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
1569 {
1570         struct election_message myem;
1571         int cmp = 0;
1572
1573         ctdb_election_data(rec, &myem);
1574
1575         /* we cant win if we don't have the recmaster capability */
1576         if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
1577                 return false;
1578         }
1579
1580         /* we cant win if we are banned */
1581         if (rec->node_flags & NODE_FLAGS_BANNED) {
1582                 return false;
1583         }
1584
1585         /* we cant win if we are stopped */
1586         if (rec->node_flags & NODE_FLAGS_STOPPED) {
1587                 return false;
1588         }
1589
1590         /* we will automatically win if the other node is banned */
1591         if (em->node_flags & NODE_FLAGS_BANNED) {
1592                 return true;
1593         }
1594
1595         /* we will automatically win if the other node is banned */
1596         if (em->node_flags & NODE_FLAGS_STOPPED) {
1597                 return true;
1598         }
1599
1600         /* then the longest running node */
1601         if (cmp == 0) {
1602                 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
1603         }
1604
1605         if (cmp == 0) {
1606                 cmp = (int)myem.pnn - (int)em->pnn;
1607         }
1608
1609         return cmp > 0;
1610 }
1611
1612 /*
1613   send out an election request
1614  */
1615 static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
1616 {
1617         int ret;
1618         TDB_DATA election_data;
1619         struct election_message emsg;
1620         uint64_t srvid;
1621         struct ctdb_context *ctdb = rec->ctdb;
1622
1623         srvid = CTDB_SRVID_ELECTION;
1624
1625         ctdb_election_data(rec, &emsg);
1626
1627         election_data.dsize = sizeof(struct election_message);
1628         election_data.dptr  = (unsigned char *)&emsg;
1629
1630
1631         /* first we assume we will win the election and set
1632            recoverymaster to be ourself on the current node
1633          */
1634         ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1635                                      CTDB_CURRENT_NODE, pnn);
1636         if (ret != 0) {
1637                 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
1638                 return -1;
1639         }
1640         rec->recmaster = pnn;
1641
1642         /* send an election message to all active nodes */
1643         DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
1644         return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
1645 }
1646
1647 /*
1648   we think we are winning the election - send a broadcast election request
1649  */
1650 static void election_send_request(struct tevent_context *ev,
1651                                   struct tevent_timer *te,
1652                                   struct timeval t, void *p)
1653 {
1654         struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
1655         int ret;
1656
1657         ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
1658         if (ret != 0) {
1659                 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
1660         }
1661
1662         TALLOC_FREE(rec->send_election_te);
1663 }
1664
1665 /*
1666   handler for memory dumps
1667 */
1668 static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1669 {
1670         struct ctdb_recoverd *rec = talloc_get_type(
1671                 private_data, struct ctdb_recoverd);
1672         struct ctdb_context *ctdb = rec->ctdb;
1673         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1674         TDB_DATA *dump;
1675         int ret;
1676         struct ctdb_srvid_message *rd;
1677
1678         if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1679                 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1680                 talloc_free(tmp_ctx);
1681                 return;
1682         }
1683         rd = (struct ctdb_srvid_message *)data.dptr;
1684
1685         dump = talloc_zero(tmp_ctx, TDB_DATA);
1686         if (dump == NULL) {
1687                 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
1688                 talloc_free(tmp_ctx);
1689                 return;
1690         }
1691         ret = ctdb_dump_memory(ctdb, dump);
1692         if (ret != 0) {
1693                 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
1694                 talloc_free(tmp_ctx);
1695                 return;
1696         }
1697
1698 DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
1699
1700         ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
1701         if (ret != 0) {
1702                 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
1703                 talloc_free(tmp_ctx);
1704                 return;
1705         }
1706
1707         talloc_free(tmp_ctx);
1708 }
1709
1710 /*
1711   handler for reload_nodes
1712 */
1713 static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
1714                                  void *private_data)
1715 {
1716         struct ctdb_recoverd *rec = talloc_get_type(
1717                 private_data, struct ctdb_recoverd);
1718
1719         DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
1720
1721         ctdb_load_nodes_file(rec->ctdb);
1722 }
1723
1724
1725 static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
1726                                         void *private_data)
1727 {
1728         struct ctdb_recoverd *rec = talloc_get_type(
1729                 private_data, struct ctdb_recoverd);
1730         struct ctdb_context *ctdb = rec->ctdb;
1731         uint32_t pnn;
1732         uint32_t *t;
1733         int len;
1734
1735         if (rec->recmaster != ctdb_get_pnn(ctdb)) {
1736                 return;
1737         }
1738
1739         if (data.dsize != sizeof(uint32_t)) {
1740                 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
1741                 return;
1742         }
1743
1744         pnn = *(uint32_t *)&data.dptr[0];
1745
1746         DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
1747
1748         /* Copy any existing list of nodes.  There's probably some
1749          * sort of realloc variant that will do this but we need to
1750          * make sure that freeing the old array also cancels the timer
1751          * event for the timeout... not sure if realloc will do that.
1752          */
1753         len = (rec->force_rebalance_nodes != NULL) ?
1754                 talloc_array_length(rec->force_rebalance_nodes) :
1755                 0;
1756
1757         /* This allows duplicates to be added but they don't cause
1758          * harm.  A call to add a duplicate PNN arguably means that
1759          * the timeout should be reset, so this is the simplest
1760          * solution.
1761          */
1762         t = talloc_zero_array(rec, uint32_t, len+1);
1763         CTDB_NO_MEMORY_VOID(ctdb, t);
1764         if (len > 0) {
1765                 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
1766         }
1767         t[len] = pnn;
1768
1769         talloc_free(rec->force_rebalance_nodes);
1770
1771         rec->force_rebalance_nodes = t;
1772 }
1773
1774
1775
1776 static void srvid_disable_and_reply(struct ctdb_context *ctdb,
1777                                     TDB_DATA data,
1778                                     struct ctdb_op_state *op_state)
1779 {
1780         struct ctdb_disable_message *r;
1781         uint32_t timeout;
1782         TDB_DATA result;
1783         int32_t ret = 0;
1784
1785         /* Validate input data */
1786         if (data.dsize != sizeof(struct ctdb_disable_message)) {
1787                 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1788                                  "expecting %lu\n", (long unsigned)data.dsize,
1789                                  (long unsigned)sizeof(struct ctdb_srvid_message)));
1790                 return;
1791         }
1792         if (data.dptr == NULL) {
1793                 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1794                 return;
1795         }
1796
1797         r = (struct ctdb_disable_message *)data.dptr;
1798         timeout = r->timeout;
1799
1800         ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
1801         if (ret != 0) {
1802                 goto done;
1803         }
1804
1805         /* Returning our PNN tells the caller that we succeeded */
1806         ret = ctdb_get_pnn(ctdb);
1807 done:
1808         result.dsize = sizeof(int32_t);
1809         result.dptr  = (uint8_t *)&ret;
1810         srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
1811 }
1812
1813 static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
1814                                           void *private_data)
1815 {
1816         struct ctdb_recoverd *rec = talloc_get_type(
1817                 private_data, struct ctdb_recoverd);
1818
1819         srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
1820 }
1821
1822 /* Backward compatibility for this SRVID */
1823 static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
1824                                      void *private_data)
1825 {
1826         struct ctdb_recoverd *rec = talloc_get_type(
1827                 private_data, struct ctdb_recoverd);
1828         uint32_t timeout;
1829
1830         if (data.dsize != sizeof(uint32_t)) {
1831                 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
1832                                  "expecting %lu\n", (long unsigned)data.dsize,
1833                                  (long unsigned)sizeof(uint32_t)));
1834                 return;
1835         }
1836         if (data.dptr == NULL) {
1837                 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
1838                 return;
1839         }
1840
1841         timeout = *((uint32_t *)data.dptr);
1842
1843         ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
1844 }
1845
1846 static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
1847                                        void *private_data)
1848 {
1849         struct ctdb_recoverd *rec = talloc_get_type(
1850                 private_data, struct ctdb_recoverd);
1851
1852         srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
1853 }
1854
1855 /*
1856   handler for ip reallocate, just add it to the list of requests and
1857   handle this later in the monitor_cluster loop so we do not recurse
1858   with other requests to takeover_run()
1859 */
1860 static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
1861                                   void *private_data)
1862 {
1863         struct ctdb_srvid_message *request;
1864         struct ctdb_recoverd *rec = talloc_get_type(
1865                 private_data, struct ctdb_recoverd);
1866
1867         if (data.dsize != sizeof(struct ctdb_srvid_message)) {
1868                 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
1869                 return;
1870         }
1871
1872         request = (struct ctdb_srvid_message *)data.dptr;
1873
1874         srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
1875 }
1876
1877 static void process_ipreallocate_requests(struct ctdb_context *ctdb,
1878                                           struct ctdb_recoverd *rec)
1879 {
1880         TDB_DATA result;
1881         int32_t ret;
1882         struct srvid_requests *current;
1883
1884         /* Only process requests that are currently pending.  More
1885          * might come in while the takeover run is in progress and
1886          * they will need to be processed later since they might
1887          * be in response flag changes.
1888          */
1889         current = rec->reallocate_requests;
1890         rec->reallocate_requests = NULL;
1891
1892         if (do_takeover_run(rec, rec->nodemap)) {
1893                 ret = ctdb_get_pnn(ctdb);
1894         } else {
1895                 ret = -1;
1896         }
1897
1898         result.dsize = sizeof(int32_t);
1899         result.dptr  = (uint8_t *)&ret;
1900
1901         srvid_requests_reply(ctdb, &current, result);
1902 }
1903
1904 /*
1905  * handler for assigning banning credits
1906  */
1907 static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1908 {
1909         struct ctdb_recoverd *rec = talloc_get_type(
1910                 private_data, struct ctdb_recoverd);
1911         uint32_t ban_pnn;
1912
1913         /* Ignore if we are not recmaster */
1914         if (rec->ctdb->pnn != rec->recmaster) {
1915                 return;
1916         }
1917
1918         if (data.dsize != sizeof(uint32_t)) {
1919                 DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
1920                                   data.dsize));
1921                 return;
1922         }
1923
1924         ban_pnn = *(uint32_t *)data.dptr;
1925
1926         ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
1927 }
1928
1929 /*
1930   handler for recovery master elections
1931 */
1932 static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
1933 {
1934         struct ctdb_recoverd *rec = talloc_get_type(
1935                 private_data, struct ctdb_recoverd);
1936         struct ctdb_context *ctdb = rec->ctdb;
1937         int ret;
1938         struct election_message *em = (struct election_message *)data.dptr;
1939
1940         /* Ignore election packets from ourself */
1941         if (ctdb->pnn == em->pnn) {
1942                 return;
1943         }
1944
1945         /* we got an election packet - update the timeout for the election */
1946         talloc_free(rec->election_timeout);
1947         rec->election_timeout = tevent_add_timer(
1948                         ctdb->ev, ctdb,
1949                         fast_start ?
1950                                 timeval_current_ofs(0, 500000) :
1951                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
1952                         ctdb_election_timeout, rec);
1953
1954         /* someone called an election. check their election data
1955            and if we disagree and we would rather be the elected node,
1956            send a new election message to all other nodes
1957          */
1958         if (ctdb_election_win(rec, em)) {
1959                 if (!rec->send_election_te) {
1960                         rec->send_election_te = tevent_add_timer(
1961                                         ctdb->ev, rec,
1962                                         timeval_current_ofs(0, 500000),
1963                                         election_send_request, rec);
1964                 }
1965                 return;
1966         }
1967
1968         /* we didn't win */
1969         TALLOC_FREE(rec->send_election_te);
1970
1971         /* Release the recovery lock file */
1972         if (ctdb_recovery_have_lock(rec)) {
1973                 ctdb_recovery_unlock(rec);
1974         }
1975
1976         /* ok, let that guy become recmaster then */
1977         ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
1978                                      CTDB_CURRENT_NODE, em->pnn);
1979         if (ret != 0) {
1980                 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
1981                 return;
1982         }
1983         rec->recmaster = em->pnn;
1984
1985         return;
1986 }
1987
1988
1989 /*
1990   force the start of the election process
1991  */
1992 static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
1993                            struct ctdb_node_map_old *nodemap)
1994 {
1995         int ret;
1996         struct ctdb_context *ctdb = rec->ctdb;
1997
1998         DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
1999
2000         /* set all nodes to recovery mode to stop all internode traffic */
2001         ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
2002         if (ret != 0) {
2003                 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2004                 return;
2005         }
2006
2007         talloc_free(rec->election_timeout);
2008         rec->election_timeout = tevent_add_timer(
2009                         ctdb->ev, ctdb,
2010                         fast_start ?
2011                                 timeval_current_ofs(0, 500000) :
2012                                 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2013                         ctdb_election_timeout, rec);
2014
2015         ret = send_election_request(rec, pnn);
2016         if (ret!=0) {
2017                 DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
2018                 return;
2019         }
2020
2021         /* wait for a few seconds to collect all responses */
2022         ctdb_wait_election(rec);
2023 }
2024
2025
2026
2027 /*
2028   handler for when a node changes its flags
2029 */
2030 static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2031 {
2032         struct ctdb_recoverd *rec = talloc_get_type(
2033                 private_data, struct ctdb_recoverd);
2034         struct ctdb_context *ctdb = rec->ctdb;
2035         int ret;
2036         struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2037         struct ctdb_node_map_old *nodemap=NULL;
2038         TALLOC_CTX *tmp_ctx;
2039         int i;
2040
2041         if (data.dsize != sizeof(*c)) {
2042                 DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
2043                 return;
2044         }
2045
2046         tmp_ctx = talloc_new(ctdb);
2047         CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2048
2049         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2050         if (ret != 0) {
2051                 DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2052                 talloc_free(tmp_ctx);
2053                 return;
2054         }
2055
2056
2057         for (i=0;i<nodemap->num;i++) {
2058                 if (nodemap->nodes[i].pnn == c->pnn) break;
2059         }
2060
2061         if (i == nodemap->num) {
2062                 DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
2063                 talloc_free(tmp_ctx);
2064                 return;
2065         }
2066
2067         if (c->old_flags != c->new_flags) {
2068                 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2069         }
2070
2071         nodemap->nodes[i].flags = c->new_flags;
2072
2073         talloc_free(tmp_ctx);
2074 }
2075
2076 /*
2077   handler for when we need to push out flag changes ot all other nodes
2078 */
2079 static void push_flags_handler(uint64_t srvid, TDB_DATA data,
2080                                void *private_data)
2081 {
2082         struct ctdb_recoverd *rec = talloc_get_type(
2083                 private_data, struct ctdb_recoverd);
2084         struct ctdb_context *ctdb = rec->ctdb;
2085         int ret;
2086         struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2087         struct ctdb_node_map_old *nodemap=NULL;
2088         TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2089         uint32_t *nodes;
2090
2091         /* read the node flags from the recmaster */
2092         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2093                                    tmp_ctx, &nodemap);
2094         if (ret != 0) {
2095                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
2096                 talloc_free(tmp_ctx);
2097                 return;
2098         }
2099         if (c->pnn >= nodemap->num) {
2100                 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2101                 talloc_free(tmp_ctx);
2102                 return;
2103         }
2104
2105         /* send the flags update to all connected nodes */
2106         nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2107
2108         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2109                                       nodes, 0, CONTROL_TIMEOUT(),
2110                                       false, data,
2111                                       NULL, NULL,
2112                                       NULL) != 0) {
2113                 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2114
2115                 talloc_free(tmp_ctx);
2116                 return;
2117         }
2118
2119         talloc_free(tmp_ctx);
2120 }
2121
2122
2123 struct verify_recmode_normal_data {
2124         uint32_t count;
2125         enum monitor_result status;
2126 };
2127
2128 static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2129 {
2130         struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2131
2132
2133         /* one more node has responded with recmode data*/
2134         rmdata->count--;
2135
2136         /* if we failed to get the recmode, then return an error and let
2137            the main loop try again.
2138         */
2139         if (state->state != CTDB_CONTROL_DONE) {
2140                 if (rmdata->status == MONITOR_OK) {
2141                         rmdata->status = MONITOR_FAILED;
2142                 }
2143                 return;
2144         }
2145
2146         /* if we got a response, then the recmode will be stored in the
2147            status field
2148         */
2149         if (state->status != CTDB_RECOVERY_NORMAL) {
2150                 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2151                 rmdata->status = MONITOR_RECOVERY_NEEDED;
2152         }
2153
2154         return;
2155 }
2156
2157
2158 /* verify that all nodes are in normal recovery mode */
2159 static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2160 {
2161         struct verify_recmode_normal_data *rmdata;
2162         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2163         struct ctdb_client_control_state *state;
2164         enum monitor_result status;
2165         int j;
2166
2167         rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2168         CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2169         rmdata->count  = 0;
2170         rmdata->status = MONITOR_OK;
2171
2172         /* loop over all active nodes and send an async getrecmode call to
2173            them*/
2174         for (j=0; j<nodemap->num; j++) {
2175                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2176                         continue;
2177                 }
2178                 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2179                                         CONTROL_TIMEOUT(),
2180                                         nodemap->nodes[j].pnn);
2181                 if (state == NULL) {
2182                         /* we failed to send the control, treat this as
2183                            an error and try again next iteration
2184                         */
2185                         DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2186                         talloc_free(mem_ctx);
2187                         return MONITOR_FAILED;
2188                 }
2189
2190                 /* set up the callback functions */
2191                 state->async.fn = verify_recmode_normal_callback;
2192                 state->async.private_data = rmdata;
2193
2194                 /* one more control to wait for to complete */
2195                 rmdata->count++;
2196         }
2197
2198
2199         /* now wait for up to the maximum number of seconds allowed
2200            or until all nodes we expect a response from has replied
2201         */
2202         while (rmdata->count > 0) {
2203                 tevent_loop_once(ctdb->ev);
2204         }
2205
2206         status = rmdata->status;
2207         talloc_free(mem_ctx);
2208         return status;
2209 }
2210
2211
2212 struct verify_recmaster_data {
2213         struct ctdb_recoverd *rec;
2214         uint32_t count;
2215         uint32_t pnn;
2216         enum monitor_result status;
2217 };
2218
2219 static void verify_recmaster_callback(struct ctdb_client_control_state *state)
2220 {
2221         struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
2222
2223
2224         /* one more node has responded with recmaster data*/
2225         rmdata->count--;
2226
2227         /* if we failed to get the recmaster, then return an error and let
2228            the main loop try again.
2229         */
2230         if (state->state != CTDB_CONTROL_DONE) {
2231                 if (rmdata->status == MONITOR_OK) {
2232                         rmdata->status = MONITOR_FAILED;
2233                 }
2234                 return;
2235         }
2236
2237         /* if we got a response, then the recmaster will be stored in the
2238            status field
2239         */
2240         if (state->status != rmdata->pnn) {
2241                 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
2242                 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
2243                 rmdata->status = MONITOR_ELECTION_NEEDED;
2244         }
2245
2246         return;
2247 }
2248
2249
2250 /* verify that all nodes agree that we are the recmaster */
2251 static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
2252 {
2253         struct ctdb_context *ctdb = rec->ctdb;
2254         struct verify_recmaster_data *rmdata;
2255         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2256         struct ctdb_client_control_state *state;
2257         enum monitor_result status;
2258         int j;
2259
2260         rmdata = talloc(mem_ctx, struct verify_recmaster_data);
2261         CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2262         rmdata->rec    = rec;
2263         rmdata->count  = 0;
2264         rmdata->pnn    = pnn;
2265         rmdata->status = MONITOR_OK;
2266
2267         /* loop over all active nodes and send an async getrecmaster call to
2268            them*/
2269         for (j=0; j<nodemap->num; j++) {
2270                 if (nodemap->nodes[j].pnn == rec->recmaster) {
2271                         continue;
2272                 }
2273                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2274                         continue;
2275                 }
2276                 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
2277                                         CONTROL_TIMEOUT(),
2278                                         nodemap->nodes[j].pnn);
2279                 if (state == NULL) {
2280                         /* we failed to send the control, treat this as
2281                            an error and try again next iteration
2282                         */
2283                         DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
2284                         talloc_free(mem_ctx);
2285                         return MONITOR_FAILED;
2286                 }
2287
2288                 /* set up the callback functions */
2289                 state->async.fn = verify_recmaster_callback;
2290                 state->async.private_data = rmdata;
2291
2292                 /* one more control to wait for to complete */
2293                 rmdata->count++;
2294         }
2295
2296
2297         /* now wait for up to the maximum number of seconds allowed
2298            or until all nodes we expect a response from has replied
2299         */
2300         while (rmdata->count > 0) {
2301                 tevent_loop_once(ctdb->ev);
2302         }
2303
2304         status = rmdata->status;
2305         talloc_free(mem_ctx);
2306         return status;
2307 }
2308
2309 static bool interfaces_have_changed(struct ctdb_context *ctdb,
2310                                     struct ctdb_recoverd *rec)
2311 {
2312         struct ctdb_iface_list_old *ifaces = NULL;
2313         TALLOC_CTX *mem_ctx;
2314         bool ret = false;
2315
2316         mem_ctx = talloc_new(NULL);
2317
2318         /* Read the interfaces from the local node */
2319         if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
2320                                  CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
2321                 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
2322                 /* We could return an error.  However, this will be
2323                  * rare so we'll decide that the interfaces have
2324                  * actually changed, just in case.
2325                  */
2326                 talloc_free(mem_ctx);
2327                 return true;
2328         }
2329
2330         if (!rec->ifaces) {
2331                 /* We haven't been here before so things have changed */
2332                 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
2333                 ret = true;
2334         } else if (rec->ifaces->num != ifaces->num) {
2335                 /* Number of interfaces has changed */
2336                 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
2337                                      rec->ifaces->num, ifaces->num));
2338                 ret = true;
2339         } else {
2340                 /* See if interface names or link states have changed */
2341                 int i;
2342                 for (i = 0; i < rec->ifaces->num; i++) {
2343                         struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
2344                         if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
2345                                 DEBUG(DEBUG_NOTICE,
2346                                       ("Interface in slot %d changed: %s => %s\n",
2347                                        i, iface->name, ifaces->ifaces[i].name));
2348                                 ret = true;
2349                                 break;
2350                         }
2351                         if (iface->link_state != ifaces->ifaces[i].link_state) {
2352                                 DEBUG(DEBUG_NOTICE,
2353                                       ("Interface %s changed state: %d => %d\n",
2354                                        iface->name, iface->link_state,
2355                                        ifaces->ifaces[i].link_state));
2356                                 ret = true;
2357                                 break;
2358                         }
2359                 }
2360         }
2361
2362         talloc_free(rec->ifaces);
2363         rec->ifaces = talloc_steal(rec, ifaces);
2364
2365         talloc_free(mem_ctx);
2366         return ret;
2367 }
2368
2369 /* Check that the local allocation of public IP addresses is correct
2370  * and do some house-keeping */
2371 static int verify_local_ip_allocation(struct ctdb_context *ctdb,
2372                                       struct ctdb_recoverd *rec,
2373                                       uint32_t pnn,
2374                                       struct ctdb_node_map_old *nodemap)
2375 {
2376         TALLOC_CTX *mem_ctx = talloc_new(NULL);
2377         int ret, j;
2378         bool need_takeover_run = false;
2379         struct ctdb_public_ip_list_old *ips = NULL;
2380
2381         /* If we are not the recmaster then do some housekeeping */
2382         if (rec->recmaster != pnn) {
2383                 /* Ignore any IP reallocate requests - only recmaster
2384                  * processes them
2385                  */
2386                 TALLOC_FREE(rec->reallocate_requests);
2387                 /* Clear any nodes that should be force rebalanced in
2388                  * the next takeover run.  If the recovery master role
2389                  * has moved then we don't want to process these some
2390                  * time in the future.
2391                  */
2392                 TALLOC_FREE(rec->force_rebalance_nodes);
2393         }
2394
2395         /* Return early if disabled... */
2396         if (ctdb_config.failover_disabled ||
2397             ctdb_op_is_disabled(rec->takeover_run)) {
2398                 return  0;
2399         }
2400
2401         if (interfaces_have_changed(ctdb, rec)) {
2402                 need_takeover_run = true;
2403         }
2404
2405         /* If there are unhosted IPs but this node can host them then
2406          * trigger an IP reallocation */
2407
2408         /* Read *available* IPs from local node */
2409         ret = ctdb_ctrl_get_public_ips_flags(
2410                 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx,
2411                 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
2412         if (ret != 0) {
2413                 DEBUG(DEBUG_ERR, ("Unable to retrieve available public IPs\n"));
2414                 talloc_free(mem_ctx);
2415                 return -1;
2416         }
2417
2418         for (j=0; j<ips->num; j++) {
2419                 if (ips->ips[j].pnn == -1 &&
2420                     nodemap->nodes[pnn].flags == 0) {
2421                         DEBUG(DEBUG_WARNING,
2422                               ("Unassigned IP %s can be served by this node\n",
2423                                ctdb_addr_to_str(&ips->ips[j].addr)));
2424                         need_takeover_run = true;
2425                 }
2426         }
2427
2428         talloc_free(ips);
2429
2430         if (!ctdb->do_checkpublicip) {
2431                 goto done;
2432         }
2433
2434         /* Validate the IP addresses that this node has on network
2435          * interfaces.  If there is an inconsistency between reality
2436          * and the state expected by CTDB then try to fix it by
2437          * triggering an IP reallocation or releasing extraneous IP
2438          * addresses. */
2439
2440         /* Read *known* IPs from local node */
2441         ret = ctdb_ctrl_get_public_ips_flags(
2442                 ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
2443         if (ret != 0) {
2444                 DEBUG(DEBUG_ERR, ("Unable to retrieve known public IPs\n"));
2445                 talloc_free(mem_ctx);
2446                 return -1;
2447         }
2448
2449         for (j=0; j<ips->num; j++) {
2450                 if (ips->ips[j].pnn == pnn) {
2451                         if (!ctdb_sys_have_ip(&ips->ips[j].addr)) {
2452                                 DEBUG(DEBUG_ERR,
2453                                       ("Assigned IP %s not on an interface\n",
2454                                        ctdb_addr_to_str(&ips->ips[j].addr)));
2455                                 need_takeover_run = true;
2456                         }
2457                 } else {
2458                         if (ctdb_sys_have_ip(&ips->ips[j].addr)) {
2459                                 DEBUG(DEBUG_ERR,
2460                                       ("IP %s incorrectly on an interface\n",
2461                                        ctdb_addr_to_str(&ips->ips[j].addr)));
2462                                 need_takeover_run = true;
2463                         }
2464                 }
2465         }
2466
2467 done:
2468         if (need_takeover_run) {
2469                 struct ctdb_srvid_message rd;
2470                 TDB_DATA data;
2471
2472                 DEBUG(DEBUG_NOTICE,("Trigger takeoverrun\n"));
2473
2474                 ZERO_STRUCT(rd);
2475                 rd.pnn = ctdb->pnn;
2476                 rd.srvid = 0;
2477                 data.dptr = (uint8_t *)&rd;
2478                 data.dsize = sizeof(rd);
2479
2480                 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
2481                 if (ret != 0) {
2482                         DEBUG(DEBUG_ERR,
2483                               ("Failed to send takeover run request\n"));
2484                 }
2485         }
2486         talloc_free(mem_ctx);
2487         return 0;
2488 }
2489
2490
2491 static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
2492 {
2493         struct ctdb_node_map_old **remote_nodemaps = callback_data;
2494
2495         if (node_pnn >= ctdb->num_nodes) {
2496                 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
2497                 return;
2498         }
2499
2500         remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
2501
2502 }
2503
2504 static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
2505         struct ctdb_node_map_old *nodemap,
2506         struct ctdb_node_map_old **remote_nodemaps)
2507 {
2508         uint32_t *nodes;
2509
2510         nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
2511         if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
2512                                         nodes, 0,
2513                                         CONTROL_TIMEOUT(), false, tdb_null,
2514                                         async_getnodemap_callback,
2515                                         NULL,
2516                                         remote_nodemaps) != 0) {
2517                 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
2518
2519                 return -1;
2520         }
2521
2522         return 0;
2523 }
2524
2525 static bool validate_recovery_master(struct ctdb_recoverd *rec,
2526                                      TALLOC_CTX *mem_ctx)
2527 {
2528         struct ctdb_context *ctdb = rec->ctdb;
2529         uint32_t pnn = ctdb_get_pnn(ctdb);
2530         struct ctdb_node_map_old *nodemap = rec->nodemap;
2531         struct ctdb_node_map_old *recmaster_nodemap = NULL;
2532         int ret;
2533
2534         /* When recovery daemon is started, recmaster is set to
2535          * "unknown" so it knows to start an election.
2536          */
2537         if (rec->recmaster == CTDB_UNKNOWN_PNN) {
2538                 DEBUG(DEBUG_NOTICE,
2539                       ("Initial recovery master set - forcing election\n"));
2540                 force_election(rec, pnn, nodemap);
2541                 return false;
2542         }
2543
2544         /*
2545          * If the current recmaster does not have CTDB_CAP_RECMASTER,
2546          * but we have, then force an election and try to become the new
2547          * recmaster.
2548          */
2549         if (!ctdb_node_has_capabilities(rec->caps,
2550                                         rec->recmaster,
2551                                         CTDB_CAP_RECMASTER) &&
2552             (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
2553             !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
2554                 DEBUG(DEBUG_ERR,
2555                       (" Current recmaster node %u does not have CAP_RECMASTER,"
2556                        " but we (node %u) have - force an election\n",
2557                        rec->recmaster, pnn));
2558                 force_election(rec, pnn, nodemap);
2559                 return false;
2560         }
2561
2562         /* Verify that the master node has not been deleted.  This
2563          * should not happen because a node should always be shutdown
2564          * before being deleted, causing a new master to be elected
2565          * before now.  However, if something strange has happened
2566          * then checking here will ensure we don't index beyond the
2567          * end of the nodemap array. */
2568         if (rec->recmaster >= nodemap->num) {
2569                 DEBUG(DEBUG_ERR,
2570                       ("Recmaster node %u has been deleted. Force election\n",
2571                        rec->recmaster));
2572                 force_election(rec, pnn, nodemap);
2573                 return false;
2574         }
2575
2576         /* if recovery master is disconnected/deleted we must elect a new recmaster */
2577         if (nodemap->nodes[rec->recmaster].flags &
2578             (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
2579                 DEBUG(DEBUG_NOTICE,
2580                       ("Recmaster node %u is disconnected/deleted. Force election\n",
2581                        rec->recmaster));
2582                 force_election(rec, pnn, nodemap);
2583                 return false;
2584         }
2585
2586         /* get nodemap from the recovery master to check if it is inactive */
2587         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2588                                    mem_ctx, &recmaster_nodemap);
2589         if (ret != 0) {
2590                 DEBUG(DEBUG_ERR,
2591                       (__location__
2592                        " Unable to get nodemap from recovery master %u\n",
2593                           rec->recmaster));
2594                 /* No election, just error */
2595                 return false;
2596         }
2597
2598
2599         if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
2600             (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
2601                 DEBUG(DEBUG_NOTICE,
2602                       ("Recmaster node %u is inactive. Force election\n",
2603                        rec->recmaster));
2604                 /*
2605                  * update our nodemap to carry the recmaster's notion of
2606                  * its own flags, so that we don't keep freezing the
2607                  * inactive recmaster node...
2608                  */
2609                 nodemap->nodes[rec->recmaster].flags =
2610                         recmaster_nodemap->nodes[rec->recmaster].flags;
2611                 force_election(rec, pnn, nodemap);
2612                 return false;
2613         }
2614
2615         return true;
2616 }
2617
2618 static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
2619                       TALLOC_CTX *mem_ctx)
2620 {
2621         uint32_t pnn;
2622         struct ctdb_node_map_old *nodemap=NULL;
2623         struct ctdb_node_map_old **remote_nodemaps=NULL;
2624         struct ctdb_vnn_map *vnnmap=NULL;
2625         struct ctdb_vnn_map *remote_vnnmap=NULL;
2626         uint32_t num_lmasters;
2627         int32_t debug_level;
2628         int i, j, ret;
2629         bool self_ban;
2630
2631
2632         /* verify that the main daemon is still running */
2633         if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
2634                 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
2635                 exit(-1);
2636         }
2637
2638         /* ping the local daemon to tell it we are alive */
2639         ctdb_ctrl_recd_ping(ctdb);
2640
2641         if (rec->election_timeout) {
2642                 /* an election is in progress */
2643                 return;
2644         }
2645
2646         /* read the debug level from the parent and update locally */
2647         ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
2648         if (ret !=0) {
2649                 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
2650                 return;
2651         }
2652         DEBUGLEVEL = debug_level;
2653
2654         /* get relevant tunables */
2655         ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
2656         if (ret != 0) {
2657                 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
2658                 return;
2659         }
2660
2661         /* get runstate */
2662         ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
2663                                      CTDB_CURRENT_NODE, &ctdb->runstate);
2664         if (ret != 0) {
2665                 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
2666                 return;
2667         }
2668
2669         pnn = ctdb_get_pnn(ctdb);
2670
2671         /* get nodemap */
2672         TALLOC_FREE(rec->nodemap);
2673         ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
2674         if (ret != 0) {
2675                 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
2676                 return;
2677         }
2678         nodemap = rec->nodemap;
2679
2680         /* remember our own node flags */
2681         rec->node_flags = nodemap->nodes[pnn].flags;
2682
2683         ban_misbehaving_nodes(rec, &self_ban);
2684         if (self_ban) {
2685                 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
2686                 return;
2687         }
2688
2689         ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(),
2690                                    CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2691         if (ret != 0) {
2692                 D_ERR("Failed to read recmode from local node\n");
2693                 return;
2694         }
2695
2696         /* if the local daemon is STOPPED or BANNED, we verify that the databases are
2697            also frozen and that the recmode is set to active.
2698         */
2699         if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
2700                 /* If this node has become inactive then we want to
2701                  * reduce the chances of it taking over the recovery
2702                  * master role when it becomes active again.  This
2703                  * helps to stabilise the recovery master role so that
2704                  * it stays on the most stable node.
2705                  */
2706                 rec->priority_time = timeval_current();
2707
2708                 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2709                         DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
2710
2711                         ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
2712                         if (ret != 0) {
2713                                 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
2714
2715                                 return;
2716                         }
2717                 }
2718                 if (! rec->frozen_on_inactive) {
2719                         ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
2720                                                CTDB_CURRENT_NODE);
2721                         if (ret != 0) {
2722                                 DEBUG(DEBUG_ERR,
2723                                       (__location__ " Failed to freeze node "
2724                                        "in STOPPED or BANNED state\n"));
2725                                 return;
2726                         }
2727
2728                         rec->frozen_on_inactive = true;
2729                 }
2730
2731                 /* If this node is stopped or banned then it is not the recovery
2732                  * master, so don't do anything. This prevents stopped or banned
2733                  * node from starting election and sending unnecessary controls.
2734                  */
2735                 return;
2736         }
2737
2738         rec->frozen_on_inactive = false;
2739
2740         /* Retrieve capabilities from all connected nodes */
2741         ret = update_capabilities(rec, nodemap);
2742         if (ret != 0) {
2743                 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2744                 return;
2745         }
2746
2747         if (! validate_recovery_master(rec, mem_ctx)) {
2748                 return;
2749         }
2750
2751         if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2752                 /* Check if an IP takeover run is needed and trigger one if
2753                  * necessary */
2754                 verify_local_ip_allocation(ctdb, rec, pnn, nodemap);
2755         }
2756
2757         /* if we are not the recmaster then we do not need to check
2758            if recovery is needed
2759          */
2760         if (pnn != rec->recmaster) {
2761                 return;
2762         }
2763
2764
2765         /* ensure our local copies of flags are right */
2766         ret = update_local_flags(rec, nodemap);
2767         if (ret != 0) {
2768                 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
2769                 return;
2770         }
2771
2772         if (ctdb->num_nodes != nodemap->num) {
2773                 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
2774                 ctdb_load_nodes_file(ctdb);
2775                 return;
2776         }
2777
2778         /* verify that all active nodes agree that we are the recmaster */
2779         switch (verify_recmaster(rec, nodemap, pnn)) {
2780         case MONITOR_RECOVERY_NEEDED:
2781                 /* can not happen */
2782                 return;
2783         case MONITOR_ELECTION_NEEDED:
2784                 force_election(rec, pnn, nodemap);
2785                 return;
2786         case MONITOR_OK:
2787                 break;
2788         case MONITOR_FAILED:
2789                 return;
2790         }
2791
2792
2793         /* get the vnnmap */
2794         ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
2795         if (ret != 0) {
2796                 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
2797                 return;
2798         }
2799
2800         if (rec->need_recovery) {
2801                 /* a previous recovery didn't finish */
2802                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2803                 return;
2804         }
2805
2806         /* verify that all active nodes are in normal mode
2807            and not in recovery mode
2808         */
2809         switch (verify_recmode(ctdb, nodemap)) {
2810         case MONITOR_RECOVERY_NEEDED:
2811                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2812                 return;
2813         case MONITOR_FAILED:
2814                 return;
2815         case MONITOR_ELECTION_NEEDED:
2816                 /* can not happen */
2817         case MONITOR_OK:
2818                 break;
2819         }
2820
2821
2822         if (ctdb->recovery_lock != NULL) {
2823                 /* We must already hold the recovery lock */
2824                 if (!ctdb_recovery_have_lock(rec)) {
2825                         DEBUG(DEBUG_ERR,("Failed recovery lock sanity check.  Force a recovery\n"));
2826                         ctdb_set_culprit(rec, ctdb->pnn);
2827                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2828                         return;
2829                 }
2830         }
2831
2832
2833         /* If recoveries are disabled then there is no use doing any
2834          * nodemap or flags checks.  Recoveries might be disabled due
2835          * to "reloadnodes", so doing these checks might cause an
2836          * unnecessary recovery.  */
2837         if (ctdb_op_is_disabled(rec->recovery)) {
2838                 goto takeover_run_checks;
2839         }
2840
2841         /* get the nodemap for all active remote nodes
2842          */
2843         remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
2844         if (remote_nodemaps == NULL) {
2845                 DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
2846                 return;
2847         }
2848         for(i=0; i<nodemap->num; i++) {
2849                 remote_nodemaps[i] = NULL;
2850         }
2851         if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
2852                 DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
2853                 return;
2854         }
2855
2856         /* verify that all other nodes have the same nodemap as we have
2857         */
2858         for (j=0; j<nodemap->num; j++) {
2859                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2860                         continue;
2861                 }
2862
2863                 if (remote_nodemaps[j] == NULL) {
2864                         DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
2865                         ctdb_set_culprit(rec, j);
2866
2867                         return;
2868                 }
2869
2870                 /* if the nodes disagree on how many nodes there are
2871                    then this is a good reason to try recovery
2872                  */
2873                 if (remote_nodemaps[j]->num != nodemap->num) {
2874                         DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
2875                                   nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
2876                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2877                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2878                         return;
2879                 }
2880
2881                 /* if the nodes disagree on which nodes exist and are
2882                    active, then that is also a good reason to do recovery
2883                  */
2884                 for (i=0;i<nodemap->num;i++) {
2885                         if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
2886                                 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
2887                                           nodemap->nodes[j].pnn, i,
2888                                           remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
2889                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2890                                 do_recovery(rec, mem_ctx, pnn, nodemap,
2891                                             vnnmap);
2892                                 return;
2893                         }
2894                 }
2895         }
2896
2897         /*
2898          * Update node flags obtained from each active node. This ensure we have
2899          * up-to-date information for all the nodes.
2900          */
2901         for (j=0; j<nodemap->num; j++) {
2902                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2903                         continue;
2904                 }
2905                 nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
2906         }
2907
2908         for (j=0; j<nodemap->num; j++) {
2909                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2910                         continue;
2911                 }
2912
2913                 /* verify the flags are consistent
2914                 */
2915                 for (i=0; i<nodemap->num; i++) {
2916                         if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2917                                 continue;
2918                         }
2919
2920                         if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
2921                                 DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
2922                                   nodemap->nodes[j].pnn,
2923                                   nodemap->nodes[i].pnn,
2924                                   remote_nodemaps[j]->nodes[i].flags,
2925                                   nodemap->nodes[i].flags));
2926                                 if (i == j) {
2927                                         DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
2928                                         update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
2929                                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2930                                         do_recovery(rec, mem_ctx, pnn, nodemap,
2931                                                     vnnmap);
2932                                         return;
2933                                 } else {
2934                                         DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
2935                                         update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
2936                                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2937                                         do_recovery(rec, mem_ctx, pnn, nodemap,
2938                                                     vnnmap);
2939                                         return;
2940                                 }
2941                         }
2942                 }
2943         }
2944
2945
2946         /* count how many active nodes there are */
2947         num_lmasters  = 0;
2948         for (i=0; i<nodemap->num; i++) {
2949                 if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
2950                         if (ctdb_node_has_capabilities(rec->caps,
2951                                                        ctdb->nodes[i]->pnn,
2952                                                        CTDB_CAP_LMASTER)) {
2953                                 num_lmasters++;
2954                         }
2955                 }
2956         }
2957
2958
2959         /* There must be the same number of lmasters in the vnn map as
2960          * there are active nodes with the lmaster capability...  or
2961          * do a recovery.
2962          */
2963         if (vnnmap->size != num_lmasters) {
2964                 DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
2965                           vnnmap->size, num_lmasters));
2966                 ctdb_set_culprit(rec, ctdb->pnn);
2967                 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2968                 return;
2969         }
2970
2971         /* verify that all active nodes in the nodemap also exist in
2972            the vnnmap.
2973          */
2974         for (j=0; j<nodemap->num; j++) {
2975                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2976                         continue;
2977                 }
2978                 if (nodemap->nodes[j].pnn == pnn) {
2979                         continue;
2980                 }
2981
2982                 for (i=0; i<vnnmap->size; i++) {
2983                         if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
2984                                 break;
2985                         }
2986                 }
2987                 if (i == vnnmap->size) {
2988                         DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
2989                                   nodemap->nodes[j].pnn));
2990                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
2991                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
2992                         return;
2993                 }
2994         }
2995
2996
2997         /* verify that all other nodes have the same vnnmap
2998            and are from the same generation
2999          */
3000         for (j=0; j<nodemap->num; j++) {
3001                 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3002                         continue;
3003                 }
3004                 if (nodemap->nodes[j].pnn == pnn) {
3005                         continue;
3006                 }
3007
3008                 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
3009                                           mem_ctx, &remote_vnnmap);
3010                 if (ret != 0) {
3011                         DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
3012                                   nodemap->nodes[j].pnn));
3013                         return;
3014                 }
3015
3016                 /* verify the vnnmap generation is the same */
3017                 if (vnnmap->generation != remote_vnnmap->generation) {
3018                         DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
3019                                   nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
3020                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3021                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3022                         return;
3023                 }
3024
3025                 /* verify the vnnmap size is the same */
3026                 if (vnnmap->size != remote_vnnmap->size) {
3027                         DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
3028                                   nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
3029                         ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3030                         do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3031                         return;
3032                 }
3033
3034                 /* verify the vnnmap is the same */
3035                 for (i=0;i<vnnmap->size;i++) {
3036                         if (remote_vnnmap->map[i] != vnnmap->map[i]) {
3037                                 DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
3038                                           nodemap->nodes[j].pnn));
3039                                 ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
3040                                 do_recovery(rec, mem_ctx, pnn, nodemap,
3041                                             vnnmap);
3042                                 return;
3043                         }
3044                 }
3045         }
3046
3047         /* FIXME: Add remote public IP checking to ensure that nodes
3048          * have the IP addresses that are allocated to them. */
3049
3050 takeover_run_checks:
3051
3052         /* If there are IP takeover runs requested or the previous one
3053          * failed then perform one and notify the waiters */
3054         if (!ctdb_op_is_disabled(rec->takeover_run) &&
3055             (rec->reallocate_requests || rec->need_takeover_run)) {
3056                 process_ipreallocate_requests(ctdb, rec);
3057         }
3058 }
3059
3060 static void recd_sig_term_handler(struct tevent_context *ev,
3061                                   struct tevent_signal *se, int signum,
3062                                   int count, void *dont_care,
3063                                   void *private_data)
3064 {
3065         struct ctdb_recoverd *rec = talloc_get_type_abort(
3066                 private_data, struct ctdb_recoverd);
3067
3068         DEBUG(DEBUG_ERR, ("Received SIGTERM, exiting\n"));
3069         ctdb_recovery_unlock(rec);
3070         exit(0);
3071 }
3072
3073
3074 /*
3075   the main monitoring loop
3076  */
3077 static void monitor_cluster(struct ctdb_context *ctdb)
3078 {
3079         struct tevent_signal *se;
3080         struct ctdb_recoverd *rec;
3081
3082         DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
3083
3084         rec = talloc_zero(ctdb, struct ctdb_recoverd);
3085         CTDB_NO_MEMORY_FATAL(ctdb, rec);
3086
3087         rec->ctdb = ctdb;
3088         rec->recmaster = CTDB_UNKNOWN_PNN;
3089         rec->recovery_lock_handle = NULL;
3090
3091         rec->takeover_run = ctdb_op_init(rec, "takeover runs");
3092         CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);
3093
3094         rec->recovery = ctdb_op_init(rec, "recoveries");
3095         CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);
3096
3097         rec->priority_time = timeval_current();
3098         rec->frozen_on_inactive = false;
3099
3100         se = tevent_add_signal(ctdb->ev, ctdb, SIGTERM, 0,
3101                                recd_sig_term_handler, rec);
3102         if (se == NULL) {
3103                 DEBUG(DEBUG_ERR, ("Failed to install SIGTERM handler\n"));
3104                 exit(1);
3105         }
3106
3107         /* register a message port for sending memory dumps */
3108         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);
3109
3110         /* when a node is assigned banning credits */
3111         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
3112                                         banning_handler, rec);
3113
3114         /* register a message port for recovery elections */
3115         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);
3116
3117         /* when nodes are disabled/enabled */
3118         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);
3119
3120         /* when we are asked to puch out a flag change */
3121         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);
3122
3123         /* register a message port for vacuum fetch */
3124         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);
3125
3126         /* register a message port for reloadnodes  */
3127         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);
3128
3129         /* register a message port for performing a takeover run */
3130         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);
3131
3132         /* register a message port for disabling the ip check for a short while */
3133         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);
3134
3135         /* register a message port for forcing a rebalance of a node next
3136            reallocation */
3137         ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);
3138
3139         /* Register a message port for disabling takeover runs */
3140         ctdb_client_set_message_handler(ctdb,
3141                                         CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
3142                                         disable_takeover_runs_handler, rec);
3143
3144         /* Register a message port for disabling recoveries */
3145         ctdb_client_set_message_handler(ctdb,
3146                                         CTDB_SRVID_DISABLE_RECOVERIES,
3147                                         disable_recoveries_handler, rec);
3148
3149         /* register a message port for detaching database */
3150         ctdb_client_set_message_handler(ctdb,
3151                                         CTDB_SRVID_DETACH_DATABASE,
3152                                         detach_database_handler, rec);
3153
3154         for (;;) {
3155                 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3156                 struct timeval start;
3157                 double elapsed;
3158
3159                 if (!mem_ctx) {
3160                         DEBUG(DEBUG_CRIT,(__location__
3161                                           " Failed to create temp context\n"));
3162                         exit(-1);
3163                 }
3164
3165                 start = timeval_current();
3166                 main_loop(ctdb, rec, mem_ctx);
3167                 talloc_free(mem_ctx);
3168
3169                 /* we only check for recovery once every second */
3170                 elapsed = timeval_elapsed(&start);
3171                 if (elapsed < ctdb->tunable.recover_interval) {
3172                         ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
3173                                           - elapsed);
3174                 }
3175         }
3176 }
3177
3178 /*
3179   event handler for when the main ctdbd dies
3180  */
3181 static void ctdb_recoverd_parent(struct tevent_context *ev,
3182                                  struct tevent_fd *fde,
3183                                  uint16_t flags, void *private_data)
3184 {
3185         DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
3186         _exit(1);
3187 }
3188
3189 /*
3190   called regularly to verify that the recovery daemon is still running
3191  */
3192 static void ctdb_check_recd(struct tevent_context *ev,
3193                             struct tevent_timer *te,
3194                             struct timeval yt, void *p)
3195 {
3196         struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
3197
3198         if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
3199                 DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
3200
3201                 tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
3202                                  ctdb_restart_recd, ctdb);
3203
3204                 return;
3205         }
3206
3207         tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3208                          timeval_current_ofs(30, 0),
3209                          ctdb_check_recd, ctdb);
3210 }
3211
3212 static void recd_sig_child_handler(struct tevent_context *ev,
3213                                    struct tevent_signal *se, int signum,
3214                                    int count, void *dont_care,
3215                                    void *private_data)
3216 {
3217 //      struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3218         int status;
3219         pid_t pid = -1;
3220
3221         while (pid != 0) {
3222                 pid = waitpid(-1, &status, WNOHANG);
3223                 if (pid == -1) {
3224                         if (errno != ECHILD) {
3225                                 DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
3226                         }
3227                         return;
3228                 }
3229                 if (pid > 0) {
3230                         DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
3231                 }
3232         }
3233 }
3234
3235 /*
3236   startup the recovery daemon as a child of the main ctdb daemon
3237  */
3238 int ctdb_start_recoverd(struct ctdb_context *ctdb)
3239 {
3240         int fd[2];
3241         struct tevent_signal *se;
3242         struct tevent_fd *fde;
3243         int ret;
3244
3245         if (pipe(fd) != 0) {
3246                 return -1;
3247         }
3248
3249         ctdb->recoverd_pid = ctdb_fork(ctdb);
3250         if (ctdb->recoverd_pid == -1) {
3251                 return -1;
3252         }
3253
3254         if (ctdb->recoverd_pid != 0) {
3255                 talloc_free(ctdb->recd_ctx);
3256                 ctdb->recd_ctx = talloc_new(ctdb);
3257                 CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
3258
3259                 close(fd[0]);
3260                 tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
3261                                  timeval_current_ofs(30, 0),
3262                                  ctdb_check_recd, ctdb);
3263                 return 0;
3264         }
3265
3266         close(fd[1]);
3267
3268         srandom(getpid() ^ time(NULL));
3269
3270         ret = logging_init(ctdb, NULL, NULL, "ctdb-recoverd");
3271         if (ret != 0) {
3272                 return -1;
3273         }
3274
3275         prctl_set_comment("ctdb_recoverd");
3276         if (switch_from_server_to_client(ctdb) != 0) {
3277                 DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
3278                 exit(1);
3279         }
3280
3281         DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
3282
3283         fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
3284                             ctdb_recoverd_parent, &fd[0]);
3285         tevent_fd_set_auto_close(fde);
3286
3287         /* set up a handler to pick up sigchld */
3288         se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
3289                                recd_sig_child_handler, ctdb);
3290         if (se == NULL) {
3291                 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
3292                 exit(1);
3293         }
3294
3295         monitor_cluster(ctdb);
3296
3297         DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
3298         return -1;
3299 }
3300
3301 /*
3302   shutdown the recovery daemon
3303  */
3304 void ctdb_stop_recoverd(struct ctdb_context *ctdb)
3305 {
3306         if (ctdb->recoverd_pid == 0) {
3307                 return;
3308         }
3309
3310         DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
3311         ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
3312
3313         TALLOC_FREE(ctdb->recd_ctx);
3314         TALLOC_FREE(ctdb->recd_ping_count);
3315 }
3316
3317 static void ctdb_restart_recd(struct tevent_context *ev,
3318                               struct tevent_timer *te,
3319                               struct timeval t, void *private_data)
3320 {
3321         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3322
3323         DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
3324         ctdb_stop_recoverd(ctdb);
3325         ctdb_start_recoverd(ctdb);
3326 }