ctdb/server/ctdb_daemon.c

   1 /*
   2    ctdb daemon code
   3
   4    Copyright (C) Andrew Tridgell  2006
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20 #include "replace.h"
  21 #include "system/network.h"
  22 #include "system/filesys.h"
  23 #include "system/wait.h"
  24 #include "system/time.h"
  25
  26 #include <talloc.h>
  27 /* Allow use of deprecated function tevent_loop_allow_nesting() */
  28 #define TEVENT_DEPRECATED
  29 #include <tevent.h>
  30 #include <tdb.h>
  31
  32 #include "lib/tdb_wrap/tdb_wrap.h"
  33 #include "lib/util/dlinklist.h"
  34 #include "lib/util/debug.h"
  35 #include "lib/util/time.h"
  36 #include "lib/util/blocking.h"
  37 #include "lib/util/become_daemon.h"
  38
  39 #include "version.h"
  40 #include "ctdb_private.h"
  41 #include "ctdb_client.h"
  42
  43 #include "common/rb_tree.h"
  44 #include "common/reqid.h"
  45 #include "common/system.h"
  46 #include "common/common.h"
  47 #include "common/logging.h"
  48 #include "common/pidfile.h"
  49 #include "common/sock_io.h"
  50
  51 struct ctdb_client_pid_list {
  52         struct ctdb_client_pid_list *next, *prev;
  53         struct ctdb_context *ctdb;
  54         pid_t pid;
  55         struct ctdb_client *client;
  56 };
  57
  58 const char *ctdbd_pidfile = NULL;
  59 static struct pidfile_context *ctdbd_pidfile_ctx = NULL;
  60
  61 static void daemon_incoming_packet(void *, struct ctdb_req_header *);
  62
  63 static pid_t __ctdbd_pid;
  64
  65 static void print_exit_message(void)
  66 {
  67         if (getpid() == __ctdbd_pid) {
  68                 DEBUG(DEBUG_NOTICE,("CTDB daemon shutting down\n"));
  69
  70                 /* Wait a second to allow pending log messages to be flushed */
  71                 sleep(1);
  72         }
  73 }
  74
  75 #ifdef HAVE_GETRUSAGE
  76
  77 struct cpu_check_threshold_data {
  78         unsigned short percent;
  79         struct timeval timeofday;
  80         struct timeval ru_time;
  81 };
  82
  83 static void ctdb_cpu_check_threshold(struct tevent_context *ev,
  84                                      struct tevent_timer *te,
  85                                      struct timeval tv,
  86                                      void *private_data)
  87 {
  88         struct ctdb_context *ctdb = talloc_get_type_abort(
  89                 private_data, struct ctdb_context);
  90         uint32_t interval = 60;
  91
  92         static unsigned short threshold = 0;
  93         static struct cpu_check_threshold_data prev = {
  94                 .percent = 0,
  95                 .timeofday = { .tv_sec = 0 },
  96                 .ru_time = { .tv_sec = 0 },
  97         };
  98
  99         struct rusage usage;
 100         struct cpu_check_threshold_data curr = {
 101                 .percent = 0,
 102         };
 103         int64_t ru_time_diff, timeofday_diff;
 104         bool first;
 105         int ret;
 106
 107         /*
 108          * Cache the threshold so that we don't waste time checking
 109          * the environment variable every time
 110          */
 111         if (threshold == 0) {
 112                 const char *t;
 113
 114                 threshold = 90;
 115
 116                 t = getenv("CTDB_TEST_CPU_USAGE_THRESHOLD");
 117                 if (t != NULL) {
 118                         int th;
 119
 120                         th = atoi(t);
 121                         if (th <= 0 || th > 100) {
 122                                 DBG_WARNING("Failed to parse env var: %s\n", t);
 123                         } else {
 124                                 threshold = th;
 125                         }
 126                 }
 127         }
 128
 129         ret = getrusage(RUSAGE_SELF, &usage);
 130         if (ret != 0) {
 131                 DBG_WARNING("rusage() failed: %d\n", ret);
 132                 goto next;
 133         }
 134
 135         /* Sum the system and user CPU usage */
 136         curr.ru_time = timeval_sum(&usage.ru_utime, &usage.ru_stime);
 137
 138         curr.timeofday = tv;
 139
 140         first = timeval_is_zero(&prev.timeofday);
 141         if (first) {
 142                 /* No previous values recorded so no calculation to do */
 143                 goto done;
 144         }
 145
 146         timeofday_diff = usec_time_diff(&curr.timeofday, &prev.timeofday);
 147         if (timeofday_diff <= 0) {
 148                 /*
 149                  * Time went backwards or didn't progress so no (sane)
 150                  * calculation can be done
 151                  */
 152                 goto done;
 153         }
 154
 155         ru_time_diff = usec_time_diff(&curr.ru_time, &prev.ru_time);
 156
 157         curr.percent = ru_time_diff * 100 / timeofday_diff;
 158
 159         if (curr.percent >= threshold) {
 160                 /* Log only if the utilisation changes */
 161                 if (curr.percent != prev.percent) {
 162                         D_WARNING("WARNING: CPU utilisation %hu%% >= "
 163                                   "threshold (%hu%%)\n",
 164                                   curr.percent,
 165                                   threshold);
 166                 }
 167         } else {
 168                 /* Log if the utilisation falls below the threshold */
 169                 if (prev.percent >= threshold) {
 170                         D_WARNING("WARNING: CPU utilisation %hu%% < "
 171                                   "threshold (%hu%%)\n",
 172                                   curr.percent,
 173                                   threshold);
 174                 }
 175         }
 176
 177 done:
 178         prev = curr;
 179
 180 next:
 181         tevent_add_timer(ctdb->ev, ctdb,
 182                          timeval_current_ofs(interval, 0),
 183                          ctdb_cpu_check_threshold,
 184                          ctdb);
 185 }
 186
 187 static void ctdb_start_cpu_check_threshold(struct ctdb_context *ctdb)
 188 {
 189         tevent_add_timer(ctdb->ev, ctdb,
 190                          timeval_current(),
 191                          ctdb_cpu_check_threshold,
 192                          ctdb);
 193 }
 194 #endif /* HAVE_GETRUSAGE */
 195
 196 static void ctdb_time_tick(struct tevent_context *ev, struct tevent_timer *te,
 197                                   struct timeval t, void *private_data)
 198 {
 199         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
 200
 201         if (getpid() != ctdb->ctdbd_pid) {
 202                 return;
 203         }
 204
 205         tevent_add_timer(ctdb->ev, ctdb,
 206                          timeval_current_ofs(1, 0),
 207                          ctdb_time_tick, ctdb);
 208 }
 209
 210 /* Used to trigger a dummy event once per second, to make
 211  * detection of hangs more reliable.
 212  */
 213 static void ctdb_start_time_tickd(struct ctdb_context *ctdb)
 214 {
 215         tevent_add_timer(ctdb->ev, ctdb,
 216                          timeval_current_ofs(1, 0),
 217                          ctdb_time_tick, ctdb);
 218 }
 219
 220 static void ctdb_start_periodic_events(struct ctdb_context *ctdb)
 221 {
 222         /* start monitoring for connected/disconnected nodes */
 223         ctdb_start_keepalive(ctdb);
 224
 225         /* start periodic update of tcp tickle lists */
 226         ctdb_start_tcp_tickle_update(ctdb);
 227
 228         /* start listening for recovery daemon pings */
 229         ctdb_control_recd_ping(ctdb);
 230
 231         /* start listening to timer ticks */
 232         ctdb_start_time_tickd(ctdb);
 233
 234 #ifdef HAVE_GETRUSAGE
 235         ctdb_start_cpu_check_threshold(ctdb);
 236 #endif /* HAVE_GETRUSAGE */
 237 }
 238
 239 static void ignore_signal(int signum)
 240 {
 241         struct sigaction act;
 242
 243         memset(&act, 0, sizeof(act));
 244
 245         act.sa_handler = SIG_IGN;
 246         sigemptyset(&act.sa_mask);
 247         sigaddset(&act.sa_mask, signum);
 248         sigaction(signum, &act, NULL);
 249 }
 250
 251
 252 /*
 253   send a packet to a client
 254  */
 255 static int daemon_queue_send(struct ctdb_client *client, struct ctdb_req_header *hdr)
 256 {
 257         CTDB_INCREMENT_STAT(client->ctdb, client_packets_sent);
 258         if (hdr->operation == CTDB_REQ_MESSAGE) {
 259                 if (ctdb_queue_length(client->queue) > client->ctdb->tunable.max_queue_depth_drop_msg) {
 260                         DEBUG(DEBUG_ERR,("CTDB_REQ_MESSAGE queue full - killing client connection.\n"));
 261                         talloc_free(client);
 262                         return -1;
 263                 }
 264         }
 265         return ctdb_queue_send(client->queue, (uint8_t *)hdr, hdr->length);
 266 }
 267
 268 /*
 269   message handler for when we are in daemon mode. This redirects the message
 270   to the right client
 271  */
 272 static void daemon_message_handler(uint64_t srvid, TDB_DATA data,
 273                                    void *private_data)
 274 {
 275         struct ctdb_client *client = talloc_get_type(private_data, struct ctdb_client);
 276         struct ctdb_req_message_old *r;
 277         int len;
 278
 279         /* construct a message to send to the client containing the data */
 280         len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
 281         r = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_MESSAGE,
 282                                len, struct ctdb_req_message_old);
 283         CTDB_NO_MEMORY_VOID(client->ctdb, r);
 284
 285         talloc_set_name_const(r, "req_message packet");
 286
 287         r->srvid         = srvid;
 288         r->datalen       = data.dsize;
 289         memcpy(&r->data[0], data.dptr, data.dsize);
 290
 291         daemon_queue_send(client, &r->hdr);
 292
 293         talloc_free(r);
 294 }
 295
 296 /*
 297   this is called when the ctdb daemon received a ctdb request to
 298   set the srvid from the client
 299  */
 300 int daemon_register_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
 301 {
 302         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
 303         int res;
 304         if (client == NULL) {
 305                 DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_register_message_handler\n"));
 306                 return -1;
 307         }
 308         res = srvid_register(ctdb->srv, client, srvid, daemon_message_handler,
 309                              client);
 310         if (res != 0) {
 311                 DEBUG(DEBUG_ERR,(__location__ " Failed to register handler %llu in daemon\n",
 312                          (unsigned long long)srvid));
 313         } else {
 314                 DEBUG(DEBUG_INFO,(__location__ " Registered message handler for srvid=%llu\n",
 315                          (unsigned long long)srvid));
 316         }
 317
 318         return res;
 319 }
 320
 321 /*
 322   this is called when the ctdb daemon received a ctdb request to
 323   remove a srvid from the client
 324  */
 325 int daemon_deregister_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
 326 {
 327         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
 328         if (client == NULL) {
 329                 DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_deregister_message_handler\n"));
 330                 return -1;
 331         }
 332         return srvid_deregister(ctdb->srv, srvid, client);
 333 }
 334
 335 void daemon_tunnel_handler(uint64_t tunnel_id, TDB_DATA data,
 336                            void *private_data)
 337 {
 338         struct ctdb_client *client =
 339                 talloc_get_type_abort(private_data, struct ctdb_client);
 340         struct ctdb_req_tunnel_old *c, *pkt;
 341         size_t len;
 342
 343         pkt = (struct ctdb_req_tunnel_old *)data.dptr;
 344
 345         len = offsetof(struct ctdb_req_tunnel_old, data) + pkt->datalen;
 346         c = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_TUNNEL,
 347                                len, struct ctdb_req_tunnel_old);
 348         if (c == NULL) {
 349                 DEBUG(DEBUG_ERR, ("Memory error in daemon_tunnel_handler\n"));
 350                 return;
 351         }
 352
 353         talloc_set_name_const(c, "req_tunnel packet");
 354
 355         c->tunnel_id = tunnel_id;
 356         c->flags = pkt->flags;
 357         c->datalen = pkt->datalen;
 358         memcpy(c->data, pkt->data, pkt->datalen);
 359
 360         daemon_queue_send(client, &c->hdr);
 361
 362         talloc_free(c);
 363 }
 364
 365 /*
 366   destroy a ctdb_client
 367 */
 368 static int ctdb_client_destructor(struct ctdb_client *client)
 369 {
 370         struct ctdb_db_context *ctdb_db;
 371
 372         ctdb_takeover_client_destructor_hook(client);
 373         reqid_remove(client->ctdb->idr, client->client_id);
 374         client->ctdb->num_clients--;
 375
 376         if (client->num_persistent_updates != 0) {
 377                 DEBUG(DEBUG_ERR,(__location__ " Client disconnecting with %u persistent updates in flight. Starting recovery\n", client->num_persistent_updates));
 378                 client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
 379         }
 380         ctdb_db = find_ctdb_db(client->ctdb, client->db_id);
 381         if (ctdb_db) {
 382                 DEBUG(DEBUG_ERR, (__location__ " client exit while transaction "
 383                                   "commit active. Forcing recovery.\n"));
 384                 client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
 385
 386                 /*
 387                  * trans3 transaction state:
 388                  *
 389                  * The destructor sets the pointer to NULL.
 390                  */
 391                 talloc_free(ctdb_db->persistent_state);
 392         }
 393
 394         return 0;
 395 }
 396
 397
 398 /*
 399   this is called when the ctdb daemon received a ctdb request message
 400   from a local client over the unix domain socket
 401  */
 402 static void daemon_request_message_from_client(struct ctdb_client *client,
 403                                                struct ctdb_req_message_old *c)
 404 {
 405         TDB_DATA data;
 406         int res;
 407
 408         if (c->hdr.destnode == CTDB_CURRENT_NODE) {
 409                 c->hdr.destnode = ctdb_get_pnn(client->ctdb);
 410         }
 411
 412         /* maybe the message is for another client on this node */
 413         if (ctdb_get_pnn(client->ctdb)==c->hdr.destnode) {
 414                 ctdb_request_message(client->ctdb, (struct ctdb_req_header *)c);
 415                 return;
 416         }
 417
 418         /* its for a remote node */
 419         data.dptr = &c->data[0];
 420         data.dsize = c->datalen;
 421         res = ctdb_daemon_send_message(client->ctdb, c->hdr.destnode,
 422                                        c->srvid, data);
 423         if (res != 0) {
 424                 DEBUG(DEBUG_ERR,(__location__ " Failed to send message to remote node %u\n",
 425                          c->hdr.destnode));
 426         }
 427 }
 428
 429
 430 struct daemon_call_state {
 431         struct ctdb_client *client;
 432         uint32_t reqid;
 433         struct ctdb_call *call;
 434         struct timeval start_time;
 435
 436         /* readonly request ? */
 437         uint32_t readonly_fetch;
 438         uint32_t client_callid;
 439 };
 440
 441 /*
 442    complete a call from a client
 443 */
 444 static void daemon_call_from_client_callback(struct ctdb_call_state *state)
 445 {
 446         struct daemon_call_state *dstate = talloc_get_type(state->async.private_data,
 447                                                            struct daemon_call_state);
 448         struct ctdb_reply_call_old *r;
 449         int res;
 450         uint32_t length;
 451         struct ctdb_client *client = dstate->client;
 452         struct ctdb_db_context *ctdb_db = state->ctdb_db;
 453
 454         talloc_steal(client, dstate);
 455         talloc_steal(dstate, dstate->call);
 456
 457         res = ctdb_daemon_call_recv(state, dstate->call);
 458         if (res != 0) {
 459                 DEBUG(DEBUG_ERR, (__location__ " ctdbd_call_recv() returned error\n"));
 460                 CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
 461
 462                 CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 1", call_latency, dstate->start_time);
 463                 return;
 464         }
 465
 466         length = offsetof(struct ctdb_reply_call_old, data) + dstate->call->reply_data.dsize;
 467         /* If the client asked for readonly FETCH, we remapped this to
 468            FETCH_WITH_HEADER when calling the daemon. So we must
 469            strip the extra header off the reply data before passing
 470            it back to the client.
 471         */
 472         if (dstate->readonly_fetch
 473         && dstate->client_callid == CTDB_FETCH_FUNC) {
 474                 length -= sizeof(struct ctdb_ltdb_header);
 475         }
 476
 477         r = ctdbd_allocate_pkt(client->ctdb, dstate, CTDB_REPLY_CALL,
 478                                length, struct ctdb_reply_call_old);
 479         if (r == NULL) {
 480                 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate reply_call in ctdb daemon\n"));
 481                 CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
 482                 CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 2", call_latency, dstate->start_time);
 483                 return;
 484         }
 485         r->hdr.reqid        = dstate->reqid;
 486         r->status           = dstate->call->status;
 487
 488         if (dstate->readonly_fetch
 489         && dstate->client_callid == CTDB_FETCH_FUNC) {
 490                 /* client only asked for a FETCH so we must strip off
 491                    the extra ctdb_ltdb header
 492                 */
 493                 r->datalen          = dstate->call->reply_data.dsize - sizeof(struct ctdb_ltdb_header);
 494                 memcpy(&r->data[0], dstate->call->reply_data.dptr + sizeof(struct ctdb_ltdb_header), r->datalen);
 495         } else {
 496                 r->datalen          = dstate->call->reply_data.dsize;
 497                 memcpy(&r->data[0], dstate->call->reply_data.dptr, r->datalen);
 498         }
 499
 500         res = daemon_queue_send(client, &r->hdr);
 501         if (res == -1) {
 502                 /* client is dead - return immediately */
 503                 return;
 504         }
 505         if (res != 0) {
 506                 DEBUG(DEBUG_ERR, (__location__ " Failed to queue packet from daemon to client\n"));
 507         }
 508         CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 3", call_latency, dstate->start_time);
 509         CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
 510         talloc_free(dstate);
 511 }
 512
 513 struct ctdb_daemon_packet_wrap {
 514         struct ctdb_context *ctdb;
 515         uint32_t client_id;
 516 };
 517
 518 /*
 519   a wrapper to catch disconnected clients
 520  */
 521 static void daemon_incoming_packet_wrap(void *p, struct ctdb_req_header *hdr)
 522 {
 523         struct ctdb_client *client;
 524         struct ctdb_daemon_packet_wrap *w = talloc_get_type(p,
 525                                                             struct ctdb_daemon_packet_wrap);
 526         if (w == NULL) {
 527                 DEBUG(DEBUG_CRIT,(__location__ " Bad packet type '%s'\n", talloc_get_name(p)));
 528                 return;
 529         }
 530
 531         client = reqid_find(w->ctdb->idr, w->client_id, struct ctdb_client);
 532         if (client == NULL) {
 533                 DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
 534                          w->client_id));
 535                 talloc_free(w);
 536                 return;
 537         }
 538         talloc_free(w);
 539
 540         /* process it */
 541         daemon_incoming_packet(client, hdr);
 542 }
 543
 544 struct ctdb_deferred_fetch_call {
 545         struct ctdb_deferred_fetch_call *next, *prev;
 546         struct ctdb_req_call_old *c;
 547         struct ctdb_daemon_packet_wrap *w;
 548 };
 549
 550 struct ctdb_deferred_fetch_queue {
 551         struct ctdb_deferred_fetch_call *deferred_calls;
 552 };
 553
 554 struct ctdb_deferred_requeue {
 555         struct ctdb_deferred_fetch_call *dfc;
 556         struct ctdb_client *client;
 557 };
 558
 559 /* called from a timer event and starts reprocessing the deferred call.*/
 560 static void reprocess_deferred_call(struct tevent_context *ev,
 561                                     struct tevent_timer *te,
 562                                     struct timeval t, void *private_data)
 563 {
 564         struct ctdb_deferred_requeue *dfr = (struct ctdb_deferred_requeue *)private_data;
 565         struct ctdb_client *client = dfr->client;
 566
 567         talloc_steal(client, dfr->dfc->c);
 568         daemon_incoming_packet(client, (struct ctdb_req_header *)dfr->dfc->c);
 569         talloc_free(dfr);
 570 }
 571
 572 /* the referral context is destroyed either after a timeout or when the initial
 573    fetch-lock has finished.
 574    at this stage, immediately start reprocessing the queued up deferred
 575    calls so they get reprocessed immediately (and since we are dmaster at
 576    this stage, trigger the waiting smbd processes to pick up and acquire the
 577    record right away.
 578 */
 579 static int deferred_fetch_queue_destructor(struct ctdb_deferred_fetch_queue *dfq)
 580 {
 581
 582         /* need to reprocess the packets from the queue explicitly instead of
 583            just using a normal destructor since we need to
 584            call the clients in the same order as the requests queued up
 585         */
 586         while (dfq->deferred_calls != NULL) {
 587                 struct ctdb_client *client;
 588                 struct ctdb_deferred_fetch_call *dfc = dfq->deferred_calls;
 589                 struct ctdb_deferred_requeue *dfr;
 590
 591                 DLIST_REMOVE(dfq->deferred_calls, dfc);
 592
 593                 client = reqid_find(dfc->w->ctdb->idr, dfc->w->client_id, struct ctdb_client);
 594                 if (client == NULL) {
 595                         DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
 596                                  dfc->w->client_id));
 597                         continue;
 598                 }
 599
 600                 /* process it by pushing it back onto the eventloop */
 601                 dfr = talloc(client, struct ctdb_deferred_requeue);
 602                 if (dfr == NULL) {
 603                         DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch requeue structure\n"));
 604                         continue;
 605                 }
 606
 607                 dfr->dfc    = talloc_steal(dfr, dfc);
 608                 dfr->client = client;
 609
 610                 tevent_add_timer(dfc->w->ctdb->ev, client, timeval_zero(),
 611                                  reprocess_deferred_call, dfr);
 612         }
 613
 614         return 0;
 615 }
 616
 617 /* insert the new deferral context into the rb tree.
 618    there should never be a pre-existing context here, but check for it
 619    warn and destroy the previous context if there is already a deferral context
 620    for this key.
 621 */
 622 static void *insert_dfq_callback(void *parm, void *data)
 623 {
 624         if (data) {
 625                 DEBUG(DEBUG_ERR,("Already have DFQ registered. Free old %p and create new %p\n", data, parm));
 626                 talloc_free(data);
 627         }
 628         return parm;
 629 }
 630
 631 /* if the original fetch-lock did not complete within a reasonable time,
 632    free the context and context for all deferred requests to cause them to be
 633    re-inserted into the event system.
 634 */
 635 static void dfq_timeout(struct tevent_context *ev, struct tevent_timer *te,
 636                         struct timeval t, void *private_data)
 637 {
 638         talloc_free(private_data);
 639 }
 640
 641 /* This function is used in the local daemon to register a KEY in a database
 642    for being "fetched"
 643    While the remote fetch is in-flight, any futher attempts to re-fetch the
 644    same record will be deferred until the fetch completes.
 645 */
 646 static int setup_deferred_fetch_locks(struct ctdb_db_context *ctdb_db, struct ctdb_call *call)
 647 {
 648         uint32_t *k;
 649         struct ctdb_deferred_fetch_queue *dfq;
 650
 651         k = ctdb_key_to_idkey(call, call->key);
 652         if (k == NULL) {
 653                 DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
 654                 return -1;
 655         }
 656
 657         dfq  = talloc(call, struct ctdb_deferred_fetch_queue);
 658         if (dfq == NULL) {
 659                 DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch queue structure\n"));
 660                 talloc_free(k);
 661                 return -1;
 662         }
 663         dfq->deferred_calls = NULL;
 664
 665         trbt_insertarray32_callback(ctdb_db->deferred_fetch, k[0], &k[0], insert_dfq_callback, dfq);
 666
 667         talloc_set_destructor(dfq, deferred_fetch_queue_destructor);
 668
 669         /* if the fetch havent completed in 30 seconds, just tear it all down
 670            and let it try again as the events are reissued */
 671         tevent_add_timer(ctdb_db->ctdb->ev, dfq, timeval_current_ofs(30, 0),
 672                          dfq_timeout, dfq);
 673
 674         talloc_free(k);
 675         return 0;
 676 }
 677
 678 /* check if this is a duplicate request to a fetch already in-flight
 679    if it is, make this call deferred to be reprocessed later when
 680    the in-flight fetch completes.
 681 */
 682 static int requeue_duplicate_fetch(struct ctdb_db_context *ctdb_db, struct ctdb_client *client, TDB_DATA key, struct ctdb_req_call_old *c)
 683 {
 684         uint32_t *k;
 685         struct ctdb_deferred_fetch_queue *dfq;
 686         struct ctdb_deferred_fetch_call *dfc;
 687
 688         k = ctdb_key_to_idkey(c, key);
 689         if (k == NULL) {
 690                 DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
 691                 return -1;
 692         }
 693
 694         dfq = trbt_lookuparray32(ctdb_db->deferred_fetch, k[0], &k[0]);
 695         if (dfq == NULL) {
 696                 talloc_free(k);
 697                 return -1;
 698         }
 699
 700
 701         talloc_free(k);
 702
 703         dfc = talloc(dfq, struct ctdb_deferred_fetch_call);
 704         if (dfc == NULL) {
 705                 DEBUG(DEBUG_ERR, ("Failed to allocate deferred fetch call structure\n"));
 706                 return -1;
 707         }
 708
 709         dfc->w = talloc(dfc, struct ctdb_daemon_packet_wrap);
 710         if (dfc->w == NULL) {
 711                 DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch daemon packet wrap structure\n"));
 712                 talloc_free(dfc);
 713                 return -1;
 714         }
 715
 716         dfc->c = talloc_steal(dfc, c);
 717         dfc->w->ctdb = ctdb_db->ctdb;
 718         dfc->w->client_id = client->client_id;
 719
 720         DLIST_ADD_END(dfq->deferred_calls, dfc);
 721
 722         return 0;
 723 }
 724
 725
 726 /*
 727   this is called when the ctdb daemon received a ctdb request call
 728   from a local client over the unix domain socket
 729  */
 730 static void daemon_request_call_from_client(struct ctdb_client *client,
 731                                             struct ctdb_req_call_old *c)
 732 {
 733         struct ctdb_call_state *state;
 734         struct ctdb_db_context *ctdb_db;
 735         struct daemon_call_state *dstate;
 736         struct ctdb_call *call;
 737         struct ctdb_ltdb_header header;
 738         TDB_DATA key, data;
 739         int ret;
 740         struct ctdb_context *ctdb = client->ctdb;
 741         struct ctdb_daemon_packet_wrap *w;
 742
 743         CTDB_INCREMENT_STAT(ctdb, total_calls);
 744         CTDB_INCREMENT_STAT(ctdb, pending_calls);
 745
 746         ctdb_db = find_ctdb_db(client->ctdb, c->db_id);
 747         if (!ctdb_db) {
 748                 DEBUG(DEBUG_ERR, (__location__ " Unknown database in request. db_id==0x%08x",
 749                           c->db_id));
 750                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 751                 return;
 752         }
 753
 754         if (ctdb_db->unhealthy_reason) {
 755                 /*
 756                  * this is just a warning, as the tdb should be empty anyway,
 757                  * and only persistent databases can be unhealthy, which doesn't
 758                  * use this code patch
 759                  */
 760                 DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in daemon_request_call_from_client(): %s\n",
 761                                      ctdb_db->db_name, ctdb_db->unhealthy_reason));
 762         }
 763
 764         key.dptr = c->data;
 765         key.dsize = c->keylen;
 766
 767         w = talloc(ctdb, struct ctdb_daemon_packet_wrap);
 768         CTDB_NO_MEMORY_VOID(ctdb, w);
 769
 770         w->ctdb = ctdb;
 771         w->client_id = client->client_id;
 772
 773         ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header,
 774                                            (struct ctdb_req_header *)c, &data,
 775                                            daemon_incoming_packet_wrap, w, true);
 776         if (ret == -2) {
 777                 /* will retry later */
 778                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 779                 return;
 780         }
 781
 782         talloc_free(w);
 783
 784         if (ret != 0) {
 785                 DEBUG(DEBUG_ERR,(__location__ " Unable to fetch record\n"));
 786                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 787                 return;
 788         }
 789
 790
 791         /* check if this fetch request is a duplicate for a
 792            request we already have in flight. If so defer it until
 793            the first request completes.
 794         */
 795         if (ctdb->tunable.fetch_collapse == 1) {
 796                 if (requeue_duplicate_fetch(ctdb_db, client, key, c) == 0) {
 797                         ret = ctdb_ltdb_unlock(ctdb_db, key);
 798                         if (ret != 0) {
 799                                 DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
 800                         }
 801                         CTDB_DECREMENT_STAT(ctdb, pending_calls);
 802                         talloc_free(data.dptr);
 803                         return;
 804                 }
 805         }
 806
 807         /* Dont do READONLY if we don't have a tracking database */
 808         if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db_readonly(ctdb_db)) {
 809                 c->flags &= ~CTDB_WANT_READONLY;
 810         }
 811
 812         if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
 813                 header.flags &= ~CTDB_REC_RO_FLAGS;
 814                 CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
 815                 CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes);
 816                 if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
 817                         ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
 818                 }
 819                 /* and clear out the tracking data */
 820                 if (tdb_delete(ctdb_db->rottdb, key) != 0) {
 821                         DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
 822                 }
 823         }
 824
 825         /* if we are revoking, we must defer all other calls until the revoke
 826          * had completed.
 827          */
 828         if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
 829                 talloc_free(data.dptr);
 830                 ret = ctdb_ltdb_unlock(ctdb_db, key);
 831
 832                 if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
 833                         ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
 834                 }
 835                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 836                 return;
 837         }
 838
 839         if ((header.dmaster == ctdb->pnn)
 840         && (!(c->flags & CTDB_WANT_READONLY))
 841         && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
 842                 header.flags   |= CTDB_REC_RO_REVOKING_READONLY;
 843                 if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
 844                         ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
 845                 }
 846                 ret = ctdb_ltdb_unlock(ctdb_db, key);
 847
 848                 if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, key, &header, data) != 0) {
 849                         ctdb_fatal(ctdb, "Failed to start record revoke");
 850                 }
 851                 talloc_free(data.dptr);
 852
 853                 if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
 854                         ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
 855                 }
 856
 857                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 858                 return;
 859         }
 860
 861         dstate = talloc(client, struct daemon_call_state);
 862         if (dstate == NULL) {
 863                 ret = ctdb_ltdb_unlock(ctdb_db, key);
 864                 if (ret != 0) {
 865                         DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
 866                 }
 867
 868                 DEBUG(DEBUG_ERR,(__location__ " Unable to allocate dstate\n"));
 869                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 870                 return;
 871         }
 872         dstate->start_time = timeval_current();
 873         dstate->client = client;
 874         dstate->reqid  = c->hdr.reqid;
 875         talloc_steal(dstate, data.dptr);
 876
 877         call = dstate->call = talloc_zero(dstate, struct ctdb_call);
 878         if (call == NULL) {
 879                 ret = ctdb_ltdb_unlock(ctdb_db, key);
 880                 if (ret != 0) {
 881                         DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
 882                 }
 883
 884                 DEBUG(DEBUG_ERR,(__location__ " Unable to allocate call\n"));
 885                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 886                 CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 1", call_latency, dstate->start_time);
 887                 return;
 888         }
 889
 890         dstate->readonly_fetch = 0;
 891         call->call_id = c->callid;
 892         call->key = key;
 893         call->call_data.dptr = c->data + c->keylen;
 894         call->call_data.dsize = c->calldatalen;
 895         call->flags = c->flags;
 896
 897         if (c->flags & CTDB_WANT_READONLY) {
 898                 /* client wants readonly record, so translate this into a
 899                    fetch with header. remember what the client asked for
 900                    so we can remap the reply back to the proper format for
 901                    the client in the reply
 902                  */
 903                 dstate->client_callid = call->call_id;
 904                 call->call_id = CTDB_FETCH_WITH_HEADER_FUNC;
 905                 dstate->readonly_fetch = 1;
 906         }
 907
 908         if (header.dmaster == ctdb->pnn) {
 909                 state = ctdb_call_local_send(ctdb_db, call, &header, &data);
 910         } else {
 911                 state = ctdb_daemon_call_send_remote(ctdb_db, call, &header);
 912                 if (ctdb->tunable.fetch_collapse == 1) {
 913                         /* This request triggered a remote fetch-lock.
 914                            set up a deferral for this key so any additional
 915                            fetch-locks are deferred until the current one
 916                            finishes.
 917                          */
 918                         setup_deferred_fetch_locks(ctdb_db, call);
 919                 }
 920         }
 921
 922         ret = ctdb_ltdb_unlock(ctdb_db, key);
 923         if (ret != 0) {
 924                 DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
 925         }
 926
 927         if (state == NULL) {
 928                 DEBUG(DEBUG_ERR,(__location__ " Unable to setup call send\n"));
 929                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 930                 CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 2", call_latency, dstate->start_time);
 931                 return;
 932         }
 933         talloc_steal(state, dstate);
 934         talloc_steal(client, state);
 935
 936         state->async.fn = daemon_call_from_client_callback;
 937         state->async.private_data = dstate;
 938 }
 939
 940
 941 static void daemon_request_control_from_client(struct ctdb_client *client,
 942                                                struct ctdb_req_control_old *c);
 943 static void daemon_request_tunnel_from_client(struct ctdb_client *client,
 944                                               struct ctdb_req_tunnel_old *c);
 945
 946 /* data contains a packet from the client */
 947 static void daemon_incoming_packet(void *p, struct ctdb_req_header *hdr)
 948 {
 949         struct ctdb_client *client = talloc_get_type(p, struct ctdb_client);
 950         TALLOC_CTX *tmp_ctx;
 951         struct ctdb_context *ctdb = client->ctdb;
 952
 953         /* place the packet as a child of a tmp_ctx. We then use
 954            talloc_free() below to free it. If any of the calls want
 955            to keep it, then they will steal it somewhere else, and the
 956            talloc_free() will be a no-op */
 957         tmp_ctx = talloc_new(client);
 958         talloc_steal(tmp_ctx, hdr);
 959
 960         if (hdr->ctdb_magic != CTDB_MAGIC) {
 961                 ctdb_set_error(client->ctdb, "Non CTDB packet rejected in daemon\n");
 962                 goto done;
 963         }
 964
 965         if (hdr->ctdb_version != CTDB_PROTOCOL) {
 966                 ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
 967                 goto done;
 968         }
 969
 970         switch (hdr->operation) {
 971         case CTDB_REQ_CALL:
 972                 CTDB_INCREMENT_STAT(ctdb, client.req_call);
 973                 daemon_request_call_from_client(client, (struct ctdb_req_call_old *)hdr);
 974                 break;
 975
 976         case CTDB_REQ_MESSAGE:
 977                 CTDB_INCREMENT_STAT(ctdb, client.req_message);
 978                 daemon_request_message_from_client(client, (struct ctdb_req_message_old *)hdr);
 979                 break;
 980
 981         case CTDB_REQ_CONTROL:
 982                 CTDB_INCREMENT_STAT(ctdb, client.req_control);
 983                 daemon_request_control_from_client(client, (struct ctdb_req_control_old *)hdr);
 984                 break;
 985
 986         case CTDB_REQ_TUNNEL:
 987                 CTDB_INCREMENT_STAT(ctdb, client.req_tunnel);
 988                 daemon_request_tunnel_from_client(client, (struct ctdb_req_tunnel_old *)hdr);
 989                 break;
 990
 991         default:
 992                 DEBUG(DEBUG_CRIT,(__location__ " daemon: unrecognized operation %u\n",
 993                          hdr->operation));
 994         }
 995
 996 done:
 997         talloc_free(tmp_ctx);
 998 }
 999
1000 /*
1001   called when the daemon gets a incoming packet
1002  */
1003 static void ctdb_daemon_read_cb(uint8_t *data, size_t cnt, void *args)
1004 {
1005         struct ctdb_client *client = talloc_get_type(args, struct ctdb_client);
1006         struct ctdb_req_header *hdr;
1007
1008         if (cnt == 0) {
1009                 talloc_free(client);
1010                 return;
1011         }
1012
1013         CTDB_INCREMENT_STAT(client->ctdb, client_packets_recv);
1014
1015         if (cnt < sizeof(*hdr)) {
1016                 ctdb_set_error(client->ctdb, "Bad packet length %u in daemon\n",
1017                                (unsigned)cnt);
1018                 return;
1019         }
1020         hdr = (struct ctdb_req_header *)data;
1021
1022         if (hdr->ctdb_magic != CTDB_MAGIC) {
1023                 ctdb_set_error(client->ctdb, "Non CTDB packet rejected\n");
1024                 goto err_out;
1025         }
1026
1027         if (hdr->ctdb_version != CTDB_PROTOCOL) {
1028                 ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
1029                 goto err_out;
1030         }
1031
1032         DEBUG(DEBUG_DEBUG,(__location__ " client request %u of type %u length %u from "
1033                  "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length,
1034                  hdr->srcnode, hdr->destnode));
1035
1036         /* it is the responsibility of the incoming packet function to free 'data' */
1037         daemon_incoming_packet(client, hdr);
1038         return;
1039
1040 err_out:
1041         TALLOC_FREE(data);
1042 }
1043
1044
1045 static int ctdb_clientpid_destructor(struct ctdb_client_pid_list *client_pid)
1046 {
1047         if (client_pid->ctdb->client_pids != NULL) {
1048                 DLIST_REMOVE(client_pid->ctdb->client_pids, client_pid);
1049         }
1050
1051         return 0;
1052 }
1053
1054 static int get_new_client_id(struct reqid_context *idr,
1055                              struct ctdb_client *client,
1056                              uint32_t *out)
1057 {
1058         uint32_t client_id;
1059
1060         client_id = reqid_new(idr, client);
1061         /*
1062          * Some places in the code (e.g. ctdb_control_db_attach(),
1063          * ctdb_control_db_detach()) assign a special meaning to
1064          * client_id 0.  The assumption is that if client_id is 0 then
1065          * the control has come from another daemon.  Therefore, we
1066          * should never return client_id == 0.
1067          */
1068         if (client_id == 0) {
1069                 /*
1070                  * Don't leak ID 0.  This is safe because the ID keeps
1071                  * increasing.  A test will be added to ensure that
1072                  * this doesn't change.
1073                  */
1074                 reqid_remove(idr, 0);
1075
1076                 client_id = reqid_new(idr, client);
1077         }
1078
1079         if (client_id == REQID_INVALID) {
1080                 return EINVAL;
1081         }
1082
1083         if (client_id == 0) {
1084                 /* Every other ID must have been used and we can't use 0 */
1085                 reqid_remove(idr, 0);
1086                 return EINVAL;
1087         }
1088
1089         *out = client_id;
1090         return 0;
1091 }
1092
1093 static void ctdb_accept_client(struct tevent_context *ev,
1094                                struct tevent_fd *fde, uint16_t flags,
1095                                void *private_data)
1096 {
1097         struct sockaddr_un addr;
1098         socklen_t len;
1099         int fd;
1100         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
1101         struct ctdb_client *client;
1102         struct ctdb_client_pid_list *client_pid;
1103         pid_t peer_pid = 0;
1104         int ret;
1105
1106         memset(&addr, 0, sizeof(addr));
1107         len = sizeof(addr);
1108         fd = accept(ctdb->daemon.sd, (struct sockaddr *)&addr, &len);
1109         if (fd == -1) {
1110                 return;
1111         }
1112         smb_set_close_on_exec(fd);
1113
1114         ret = set_blocking(fd, false);
1115         if (ret != 0) {
1116                 DEBUG(DEBUG_ERR,
1117                       (__location__
1118                        " failed to set socket non-blocking (%s)\n",
1119                        strerror(errno)));
1120                 close(fd);
1121                 return;
1122         }
1123
1124         set_close_on_exec(fd);
1125
1126         DEBUG(DEBUG_DEBUG,(__location__ " Created SOCKET FD:%d to connected child\n", fd));
1127
1128         client = talloc_zero(ctdb, struct ctdb_client);
1129         if (ctdb_get_peer_pid(fd, &peer_pid) == 0) {
1130                 DEBUG(DEBUG_INFO,("Connected client with pid:%u\n", (unsigned)peer_pid));
1131         }
1132
1133         client->ctdb = ctdb;
1134         client->fd = fd;
1135
1136         ret = get_new_client_id(ctdb->idr, client, &client->client_id);
1137         if (ret != 0) {
1138                 DBG_ERR("Unable to get client ID (%d)\n", ret);
1139                 close(fd);
1140                 talloc_free(client);
1141                 return;
1142         }
1143
1144         client->pid = peer_pid;
1145
1146         client_pid = talloc(client, struct ctdb_client_pid_list);
1147         if (client_pid == NULL) {
1148                 DEBUG(DEBUG_ERR,("Failed to allocate client pid structure\n"));
1149                 close(fd);
1150                 talloc_free(client);
1151                 return;
1152         }
1153         client_pid->ctdb   = ctdb;
1154         client_pid->pid    = peer_pid;
1155         client_pid->client = client;
1156
1157         DLIST_ADD(ctdb->client_pids, client_pid);
1158
1159         client->queue = ctdb_queue_setup(ctdb, client, fd, CTDB_DS_ALIGNMENT,
1160                                          ctdb_daemon_read_cb, client,
1161                                          "client-%u", client->pid);
1162
1163         talloc_set_destructor(client, ctdb_client_destructor);
1164         talloc_set_destructor(client_pid, ctdb_clientpid_destructor);
1165         ctdb->num_clients++;
1166 }
1167
1168
1169
1170 /*
1171   create a unix domain socket and bind it
1172   return a file descriptor open on the socket
1173 */
1174 static int ux_socket_bind(struct ctdb_context *ctdb)
1175 {
1176         struct sockaddr_un addr = { .sun_family = AF_UNIX };
1177         int ret;
1178
1179         ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
1180         if (ctdb->daemon.sd == -1) {
1181                 return -1;
1182         }
1183
1184         strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)-1);
1185
1186         if (! sock_clean(ctdb->daemon.name)) {
1187                 return -1;
1188         }
1189
1190         set_close_on_exec(ctdb->daemon.sd);
1191
1192         ret = set_blocking(ctdb->daemon.sd, false);
1193         if (ret != 0) {
1194                 DEBUG(DEBUG_ERR,
1195                       (__location__
1196                        " failed to set socket non-blocking (%s)\n",
1197                        strerror(errno)));
1198                 goto failed;
1199         }
1200
1201         if (bind(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
1202                 DEBUG(DEBUG_CRIT,("Unable to bind on ctdb socket '%s'\n", ctdb->daemon.name));
1203                 goto failed;
1204         }
1205
1206         if (chown(ctdb->daemon.name, geteuid(), getegid()) != 0 ||
1207             chmod(ctdb->daemon.name, 0700) != 0) {
1208                 DEBUG(DEBUG_CRIT,("Unable to secure ctdb socket '%s', ctdb->daemon.name\n", ctdb->daemon.name));
1209                 goto failed;
1210         }
1211
1212
1213         if (listen(ctdb->daemon.sd, 100) != 0) {
1214                 DEBUG(DEBUG_CRIT,("Unable to listen on ctdb socket '%s'\n", ctdb->daemon.name));
1215                 goto failed;
1216         }
1217
1218         DEBUG(DEBUG_NOTICE, ("Listening to ctdb socket %s\n",
1219                              ctdb->daemon.name));
1220         return 0;
1221
1222 failed:
1223         close(ctdb->daemon.sd);
1224         ctdb->daemon.sd = -1;
1225         return -1;
1226 }
1227
1228 static void initialise_node_flags (struct ctdb_context *ctdb)
1229 {
1230         unsigned int i;
1231
1232         /* Always found: PNN correctly set just before this is called */
1233         for (i = 0; i < ctdb->num_nodes; i++) {
1234                 if (ctdb->pnn == ctdb->nodes[i]->pnn) {
1235                         break;
1236                 }
1237         }
1238
1239         ctdb->nodes[i]->flags &= ~NODE_FLAGS_DISCONNECTED;
1240
1241         /* do we start out in DISABLED mode? */
1242         if (ctdb->start_as_disabled != 0) {
1243                 D_ERR("This node is configured to start in DISABLED state\n");
1244                 ctdb->nodes[i]->flags |= NODE_FLAGS_DISABLED;
1245         }
1246         /* do we start out in STOPPED mode? */
1247         if (ctdb->start_as_stopped != 0) {
1248                 D_ERR("This node is configured to start in STOPPED state\n");
1249                 ctdb->nodes[i]->flags |= NODE_FLAGS_STOPPED;
1250         }
1251 }
1252
1253 static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
1254                                       void *private_data)
1255 {
1256         if (status != 0) {
1257                 ctdb_die(ctdb, "Failed to run setup event");
1258         }
1259         ctdb_run_notification_script(ctdb, "setup");
1260
1261         /* Start the recovery daemon */
1262         if (ctdb_start_recoverd(ctdb) != 0) {
1263                 DEBUG(DEBUG_ALERT,("Failed to start recovery daemon\n"));
1264                 exit(11);
1265         }
1266
1267         ctdb_start_periodic_events(ctdb);
1268
1269         ctdb_wait_for_first_recovery(ctdb);
1270 }
1271
1272 static struct timeval tevent_before_wait_ts;
1273 static struct timeval tevent_after_wait_ts;
1274
1275 static void ctdb_tevent_trace_init(void)
1276 {
1277         struct timeval now;
1278
1279         now = timeval_current();
1280
1281         tevent_before_wait_ts = now;
1282         tevent_after_wait_ts = now;
1283 }
1284
1285 static void ctdb_tevent_trace(enum tevent_trace_point tp,
1286                               void *private_data)
1287 {
1288         struct timeval diff;
1289         struct timeval now;
1290         struct ctdb_context *ctdb =
1291                 talloc_get_type(private_data, struct ctdb_context);
1292
1293         if (getpid() != ctdb->ctdbd_pid) {
1294                 return;
1295         }
1296
1297         now = timeval_current();
1298
1299         switch (tp) {
1300         case TEVENT_TRACE_BEFORE_WAIT:
1301                 diff = timeval_until(&tevent_after_wait_ts, &now);
1302                 if (diff.tv_sec > 3) {
1303                         DEBUG(DEBUG_ERR,
1304                               ("Handling event took %ld seconds!\n",
1305                                (long)diff.tv_sec));
1306                 }
1307                 tevent_before_wait_ts = now;
1308                 break;
1309
1310         case TEVENT_TRACE_AFTER_WAIT:
1311                 diff = timeval_until(&tevent_before_wait_ts, &now);
1312                 if (diff.tv_sec > 3) {
1313                         DEBUG(DEBUG_ERR,
1314                               ("No event for %ld seconds!\n",
1315                                (long)diff.tv_sec));
1316                 }
1317                 tevent_after_wait_ts = now;
1318                 break;
1319
1320         default:
1321                 /* Do nothing for future tevent trace points */ ;
1322         }
1323 }
1324
1325 static void ctdb_remove_pidfile(void)
1326 {
1327         TALLOC_FREE(ctdbd_pidfile_ctx);
1328 }
1329
1330 static void ctdb_create_pidfile(TALLOC_CTX *mem_ctx)
1331 {
1332         if (ctdbd_pidfile != NULL) {
1333                 int ret = pidfile_context_create(mem_ctx, ctdbd_pidfile,
1334                                                  &ctdbd_pidfile_ctx);
1335                 if (ret != 0) {
1336                         DEBUG(DEBUG_ERR,
1337                               ("Failed to create PID file %s\n",
1338                                ctdbd_pidfile));
1339                         exit(11);
1340                 }
1341
1342                 DEBUG(DEBUG_NOTICE, ("Created PID file %s\n", ctdbd_pidfile));
1343                 atexit(ctdb_remove_pidfile);
1344         }
1345 }
1346
1347 static void ctdb_initialise_vnn_map(struct ctdb_context *ctdb)
1348 {
1349         unsigned int i, j, count;
1350
1351         /* initialize the vnn mapping table, skipping any deleted nodes */
1352         ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
1353         CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map);
1354
1355         count = 0;
1356         for (i = 0; i < ctdb->num_nodes; i++) {
1357                 if ((ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) == 0) {
1358                         count++;
1359                 }
1360         }
1361
1362         ctdb->vnn_map->generation = INVALID_GENERATION;
1363         ctdb->vnn_map->size = count;
1364         ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, ctdb->vnn_map->size);
1365         CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map->map);
1366
1367         for(i=0, j=0; i < ctdb->vnn_map->size; i++) {
1368                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1369                         continue;
1370                 }
1371                 ctdb->vnn_map->map[j] = i;
1372                 j++;
1373         }
1374 }
1375
1376 static void ctdb_set_my_pnn(struct ctdb_context *ctdb)
1377 {
1378         if (ctdb->address == NULL) {
1379                 ctdb_fatal(ctdb,
1380                            "Can not determine PNN - node address is not set\n");
1381         }
1382
1383         ctdb->pnn = ctdb_ip_to_pnn(ctdb, ctdb->address);
1384         if (ctdb->pnn == CTDB_UNKNOWN_PNN) {
1385                 ctdb_fatal(ctdb,
1386                            "Can not determine PNN - unknown node address\n");
1387         }
1388
1389         D_NOTICE("PNN is %u\n", ctdb->pnn);
1390 }
1391
1392 static void stdin_handler(struct tevent_context *ev,
1393                           struct tevent_fd *fde,
1394                           uint16_t flags,
1395                           void *private_data)
1396 {
1397         struct ctdb_context *ctdb = talloc_get_type_abort(
1398                 private_data, struct ctdb_context);
1399         ssize_t nread;
1400         char c;
1401
1402         nread = read(STDIN_FILENO, &c, 1);
1403         if (nread != 1) {
1404                 D_ERR("stdin closed, exiting\n");
1405                 talloc_free(fde);
1406                 ctdb_shutdown_sequence(ctdb, EPIPE);
1407         }
1408 }
1409
1410 static int setup_stdin_handler(struct ctdb_context *ctdb)
1411 {
1412         struct tevent_fd *fde;
1413         struct stat st;
1414         int ret;
1415
1416         ret = fstat(STDIN_FILENO, &st);
1417         if (ret != 0) {
1418                 /* Problem with stdin, ignore... */
1419                 DBG_INFO("Can't fstat() stdin\n");
1420                 return 0;
1421         }
1422
1423         if (!S_ISFIFO(st.st_mode)) {
1424                 DBG_INFO("Not a pipe...\n");
1425                 return 0;
1426         }
1427
1428         fde = tevent_add_fd(ctdb->ev,
1429                             ctdb,
1430                             STDIN_FILENO,
1431                             TEVENT_FD_READ,
1432                             stdin_handler,
1433                             ctdb);
1434         if (fde == NULL) {
1435                 return ENOMEM;
1436         }
1437
1438         DBG_INFO("Set up stdin handler\n");
1439         return 0;
1440 }
1441
/*
  Fork and continue in the child only: the parent exits with status 0.
  If fork() fails the process exits with status 1.
 */
static void fork_only(void)
{
        switch (fork()) {
        case -1:
                D_ERR("Fork failed (errno=%d)\n", errno);
                exit(1);

        case 0:
                /* Child carries on */
                return;

        default:
                /* Parent simply exits... */
                exit(0);
        }
}
1457
1458 /*
1459   start the protocol going as a daemon
1460 */
1461 int ctdb_start_daemon(struct ctdb_context *ctdb,
1462                       bool interactive,
1463                       bool test_mode_enabled)
1464 {
1465         int res, ret = -1;
1466         struct tevent_fd *fde;
1467
1468         /* Fork if not interactive */
1469         if (!interactive) {
1470                 if (test_mode_enabled) {
1471                         /* Keep stdin open */
1472                         fork_only();
1473                 } else {
1474                         /* Fork, close stdin, start a session */
1475                         become_daemon(true, false, false);
1476                 }
1477         }
1478
1479         ignore_signal(SIGPIPE);
1480         ignore_signal(SIGUSR1);
1481
1482         ctdb->ctdbd_pid = getpid();
1483         DEBUG(DEBUG_ERR, ("Starting CTDBD (Version %s) as PID: %u\n",
1484                           SAMBA_VERSION_STRING, ctdb->ctdbd_pid));
1485         ctdb_create_pidfile(ctdb);
1486
1487         /* create a unix domain stream socket to listen to */
1488         res = ux_socket_bind(ctdb);
1489         if (res!=0) {
1490                 DEBUG(DEBUG_ALERT,("Cannot continue.  Exiting!\n"));
1491                 exit(10);
1492         }
1493
1494         /* Make sure we log something when the daemon terminates.
1495          * This must be the first exit handler to run (so the last to
1496          * be registered.
1497          */
1498         __ctdbd_pid = getpid();
1499         atexit(print_exit_message);
1500
1501         if (ctdb->do_setsched) {
1502                 /* try to set us up as realtime */
1503                 if (!set_scheduler()) {
1504                         exit(1);
1505                 }
1506                 DEBUG(DEBUG_NOTICE, ("Set real-time scheduler priority\n"));
1507         }
1508
1509         ctdb->ev = tevent_context_init(NULL);
1510         if (ctdb->ev == NULL) {
1511                 DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n"));
1512                 exit(1);
1513         }
1514         tevent_loop_allow_nesting(ctdb->ev);
1515         ctdb_tevent_trace_init();
1516         tevent_set_trace_callback(ctdb->ev, ctdb_tevent_trace, ctdb);
1517
1518         /* set up a handler to pick up sigchld */
1519         if (ctdb_init_sigchld(ctdb) == NULL) {
1520                 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD\n"));
1521                 exit(1);
1522         }
1523
1524         if (!interactive) {
1525                 ctdb_set_child_logging(ctdb);
1526         }
1527
1528         /* Exit if stdin is closed */
1529         if (test_mode_enabled) {
1530                 ret = setup_stdin_handler(ctdb);
1531                 if (ret != 0) {
1532                         DBG_ERR("Failed to setup stdin handler\n");
1533                         exit(1);
1534                 }
1535         }
1536
1537         TALLOC_FREE(ctdb->srv);
1538         if (srvid_init(ctdb, &ctdb->srv) != 0) {
1539                 DEBUG(DEBUG_CRIT,("Failed to setup message srvid context\n"));
1540                 exit(1);
1541         }
1542
1543         TALLOC_FREE(ctdb->tunnels);
1544         if (srvid_init(ctdb, &ctdb->tunnels) != 0) {
1545                 DEBUG(DEBUG_ERR, ("Failed to setup tunnels context\n"));
1546                 exit(1);
1547         }
1548
1549         /* initialize statistics collection */
1550         ctdb_statistics_init(ctdb);
1551
1552         /* force initial recovery for election */
1553         ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
1554
1555         if (ctdb_start_eventd(ctdb) != 0) {
1556                 DEBUG(DEBUG_ERR, ("Failed to start event daemon\n"));
1557                 exit(1);
1558         }
1559
1560         ctdb_set_runstate(ctdb, CTDB_RUNSTATE_INIT);
1561         ret = ctdb_event_script(ctdb, CTDB_EVENT_INIT);
1562         if (ret != 0) {
1563                 ctdb_die(ctdb, "Failed to run init event\n");
1564         }
1565         ctdb_run_notification_script(ctdb, "init");
1566
1567         if (strcmp(ctdb->transport, "tcp") == 0) {
1568                 ret = ctdb_tcp_init(ctdb);
1569         }
1570 #ifdef USE_INFINIBAND
1571         if (strcmp(ctdb->transport, "ib") == 0) {
1572                 ret = ctdb_ibw_init(ctdb);
1573         }
1574 #endif
1575         if (ret != 0) {
1576                 DEBUG(DEBUG_ERR,("Failed to initialise transport '%s'\n", ctdb->transport));
1577                 return -1;
1578         }
1579
1580         if (ctdb->methods == NULL) {
1581                 DEBUG(DEBUG_ALERT,(__location__ " Can not initialize transport. ctdb->methods is NULL\n"));
1582                 ctdb_fatal(ctdb, "transport is unavailable. can not initialize.");
1583         }
1584
1585         /* Initialise the transport.  This sets the node address if it
1586          * was not set via the command-line. */
1587         if (ctdb->methods->initialise(ctdb) != 0) {
1588                 ctdb_fatal(ctdb, "transport failed to initialise");
1589         }
1590
1591         ctdb_set_my_pnn(ctdb);
1592
1593         initialise_node_flags(ctdb);
1594
1595         ret = ctdb_set_public_addresses(ctdb, true);
1596         if (ret == -1) {
1597                 D_ERR("Unable to setup public IP addresses\n");
1598                 exit(1);
1599         }
1600
1601         ctdb_initialise_vnn_map(ctdb);
1602
1603         /* attach to existing databases */
1604         if (ctdb_attach_databases(ctdb) != 0) {
1605                 ctdb_fatal(ctdb, "Failed to attach to databases\n");
1606         }
1607
1608         /* start frozen, then let the first election sort things out */
1609         if (!ctdb_blocking_freeze(ctdb)) {
1610                 ctdb_fatal(ctdb, "Failed to get initial freeze\n");
1611         }
1612
1613         /* now start accepting clients, only can do this once frozen */
1614         fde = tevent_add_fd(ctdb->ev, ctdb, ctdb->daemon.sd, TEVENT_FD_READ,
1615                             ctdb_accept_client, ctdb);
1616         if (fde == NULL) {
1617                 ctdb_fatal(ctdb, "Failed to add daemon socket to event loop");
1618         }
1619         tevent_fd_set_auto_close(fde);
1620
1621         /* Start the transport */
1622         if (ctdb->methods->start(ctdb) != 0) {
1623                 DEBUG(DEBUG_ALERT,("transport failed to start!\n"));
1624                 ctdb_fatal(ctdb, "transport failed to start");
1625         }
1626
1627         /* Recovery daemon and timed events are started from the
1628          * callback, only after the setup event completes
1629          * successfully.
1630          */
1631         ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SETUP);
1632         ret = ctdb_event_script_callback(ctdb,
1633                                          ctdb,
1634                                          ctdb_setup_event_callback,
1635                                          ctdb,
1636                                          CTDB_EVENT_SETUP,
1637                                          "%s",
1638                                          "");
1639         if (ret != 0) {
1640                 DEBUG(DEBUG_CRIT,("Failed to set up 'setup' event\n"));
1641                 exit(1);
1642         }
1643
1644         lockdown_memory(ctdb->valgrinding);
1645
1646         /* go into a wait loop to allow other nodes to complete */
1647         tevent_loop_wait(ctdb->ev);
1648
1649         DEBUG(DEBUG_CRIT,("event_loop_wait() returned. this should not happen\n"));
1650         exit(1);
1651 }
1652
1653 /*
1654   allocate a packet for use in daemon<->daemon communication
1655  */
1656 struct ctdb_req_header *_ctdb_transport_allocate(struct ctdb_context *ctdb,
1657                                                  TALLOC_CTX *mem_ctx,
1658                                                  enum ctdb_operation operation,
1659                                                  size_t length, size_t slength,
1660                                                  const char *type)
1661 {
1662         int size;
1663         struct ctdb_req_header *hdr;
1664
1665         length = MAX(length, slength);
1666         size = (length+(CTDB_DS_ALIGNMENT-1)) & ~(CTDB_DS_ALIGNMENT-1);
1667
1668         if (ctdb->methods == NULL) {
1669                 DEBUG(DEBUG_INFO,(__location__ " Unable to allocate transport packet for operation %u of length %u. Transport is DOWN.\n",
1670                          operation, (unsigned)length));
1671                 return NULL;
1672         }
1673
1674         hdr = (struct ctdb_req_header *)ctdb->methods->allocate_pkt(mem_ctx, size);
1675         if (hdr == NULL) {
1676                 DEBUG(DEBUG_ERR,("Unable to allocate transport packet for operation %u of length %u\n",
1677                          operation, (unsigned)length));
1678                 return NULL;
1679         }
1680         talloc_set_name_const(hdr, type);
1681         memset(hdr, 0, slength);
1682         hdr->length       = length;
1683         hdr->operation    = operation;
1684         hdr->ctdb_magic   = CTDB_MAGIC;
1685         hdr->ctdb_version = CTDB_PROTOCOL;
1686         hdr->generation   = ctdb->vnn_map->generation;
1687         hdr->srcnode      = ctdb->pnn;
1688
1689         return hdr;
1690 }
1691
/*
  Bookkeeping for one control request that a local client asked the
  daemon to send.  While the destination pnn is valid the entry is also
  linked (via next/prev) into that node's pending_controls list.
 */
struct daemon_control_state {
        struct daemon_control_state *next;
        struct daemon_control_state *prev;
        /* client that issued the control */
        struct ctdb_client *client;
        /* the request packet; talloc-stolen onto this state */
        struct ctdb_req_control_old *c;
        /* reqid copied from the request header, echoed in the reply */
        uint32_t reqid;
        /* destination node, or NULL if the destination pnn is not valid */
        struct ctdb_node *node;
};
1699
1700 /*
1701   callback when a control reply comes in
1702  */
1703 static void daemon_control_callback(struct ctdb_context *ctdb,
1704                                     int32_t status, TDB_DATA data,
1705                                     const char *errormsg,
1706                                     void *private_data)
1707 {
1708         struct daemon_control_state *state = talloc_get_type(private_data,
1709                                                              struct daemon_control_state);
1710         struct ctdb_client *client = state->client;
1711         struct ctdb_reply_control_old *r;
1712         size_t len;
1713         int ret;
1714
1715         /* construct a message to send to the client containing the data */
1716         len = offsetof(struct ctdb_reply_control_old, data) + data.dsize;
1717         if (errormsg) {
1718                 len += strlen(errormsg);
1719         }
1720         r = ctdbd_allocate_pkt(ctdb, state, CTDB_REPLY_CONTROL, len,
1721                                struct ctdb_reply_control_old);
1722         CTDB_NO_MEMORY_VOID(ctdb, r);
1723
1724         r->hdr.reqid     = state->reqid;
1725         r->status        = status;
1726         r->datalen       = data.dsize;
1727         r->errorlen = 0;
1728         memcpy(&r->data[0], data.dptr, data.dsize);
1729         if (errormsg) {
1730                 r->errorlen = strlen(errormsg);
1731                 memcpy(&r->data[r->datalen], errormsg, r->errorlen);
1732         }
1733
1734         ret = daemon_queue_send(client, &r->hdr);
1735         if (ret != -1) {
1736                 talloc_free(state);
1737         }
1738 }
1739
1740 /*
1741   fail all pending controls to a disconnected node
1742  */
1743 void ctdb_daemon_cancel_controls(struct ctdb_context *ctdb, struct ctdb_node *node)
1744 {
1745         struct daemon_control_state *state;
1746         while ((state = node->pending_controls)) {
1747                 DLIST_REMOVE(node->pending_controls, state);
1748                 daemon_control_callback(ctdb, (uint32_t)-1, tdb_null,
1749                                         "node is disconnected", state);
1750         }
1751 }
1752
1753 /*
1754   destroy a daemon_control_state
1755  */
1756 static int daemon_control_destructor(struct daemon_control_state *state)
1757 {
1758         if (state->node) {
1759                 DLIST_REMOVE(state->node->pending_controls, state);
1760         }
1761         return 0;
1762 }
1763
1764 /*
1765   this is called when the ctdb daemon received a ctdb request control
1766   from a local client over the unix domain socket
1767  */
1768 static void daemon_request_control_from_client(struct ctdb_client *client,
1769                                                struct ctdb_req_control_old *c)
1770 {
1771         TDB_DATA data;
1772         int res;
1773         struct daemon_control_state *state;
1774         TALLOC_CTX *tmp_ctx = talloc_new(client);
1775
1776         if (c->hdr.destnode == CTDB_CURRENT_NODE) {
1777                 c->hdr.destnode = client->ctdb->pnn;
1778         }
1779
1780         state = talloc(client, struct daemon_control_state);
1781         CTDB_NO_MEMORY_VOID(client->ctdb, state);
1782
1783         state->client = client;
1784         state->c = talloc_steal(state, c);
1785         state->reqid = c->hdr.reqid;
1786         if (ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
1787                 state->node = client->ctdb->nodes[c->hdr.destnode];
1788                 DLIST_ADD(state->node->pending_controls, state);
1789         } else {
1790                 state->node = NULL;
1791         }
1792
1793         talloc_set_destructor(state, daemon_control_destructor);
1794
1795         if (c->flags & CTDB_CTRL_FLAG_NOREPLY) {
1796                 talloc_steal(tmp_ctx, state);
1797         }
1798
1799         data.dptr = &c->data[0];
1800         data.dsize = c->datalen;
1801         res = ctdb_daemon_send_control(client->ctdb, c->hdr.destnode,
1802                                        c->srvid, c->opcode, client->client_id,
1803                                        c->flags,
1804                                        data, daemon_control_callback,
1805                                        state);
1806         if (res != 0) {
1807                 DEBUG(DEBUG_ERR,(__location__ " Failed to send control to remote node %u\n",
1808                          c->hdr.destnode));
1809         }
1810
1811         talloc_free(tmp_ctx);
1812 }
1813
1814 static void daemon_request_tunnel_from_client(struct ctdb_client *client,
1815                                               struct ctdb_req_tunnel_old *c)
1816 {
1817         TDB_DATA data;
1818         int ret;
1819
1820         if (! ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
1821                 DEBUG(DEBUG_ERR, ("Invalid destination 0x%x\n",
1822                                   c->hdr.destnode));
1823                 return;
1824         }
1825
1826         ret = srvid_exists(client->ctdb->tunnels, c->tunnel_id, NULL);
1827         if (ret != 0) {
1828                 DEBUG(DEBUG_ERR,
1829                       ("tunnel id 0x%"PRIx64" not registered, dropping pkt\n",
1830                        c->tunnel_id));
1831                 return;
1832         }
1833
1834         data = (TDB_DATA) {
1835                 .dsize = c->datalen,
1836                 .dptr = &c->data[0],
1837         };
1838
1839         ret = ctdb_daemon_send_tunnel(client->ctdb, c->hdr.destnode,
1840                                       c->tunnel_id, c->flags, data);
1841         if (ret != 0) {
1842                 DEBUG(DEBUG_ERR, ("Failed to set tunnel to remote note %u\n",
1843                                   c->hdr.destnode));
1844         }
1845 }
1846
1847 /*
1848   register a call function
1849 */
1850 int ctdb_daemon_set_call(struct ctdb_context *ctdb, uint32_t db_id,
1851                          ctdb_fn_t fn, int id)
1852 {
1853         struct ctdb_registered_call *call;
1854         struct ctdb_db_context *ctdb_db;
1855
1856         ctdb_db = find_ctdb_db(ctdb, db_id);
1857         if (ctdb_db == NULL) {
1858                 return -1;
1859         }
1860
1861         call = talloc(ctdb_db, struct ctdb_registered_call);
1862         call->fn = fn;
1863         call->id = id;
1864
1865         DLIST_ADD(ctdb_db->calls, call);
1866         return 0;
1867 }
1868
1869
1870
/*
  State for a message that this node sends to itself.

  This local messaging handler is ugly, but it is needed to prevent
  recursion in ctdb_send_message() when the destination node is the
  same as the source node: instead of dispatching immediately, the
  message is parked in this structure and dispatched from a
  zero-timeout timer event.
 */
struct ctdb_local_message {
        struct ctdb_context *ctdb;      /* daemon context used for dispatch */
        uint64_t srvid;                 /* service id the message is addressed to */
        TDB_DATA data;                  /* private copy of the payload, owned by this struct */
};
1881
1882 static void ctdb_local_message_trigger(struct tevent_context *ev,
1883                                        struct tevent_timer *te,
1884                                        struct timeval t, void *private_data)
1885 {
1886         struct ctdb_local_message *m = talloc_get_type(
1887                 private_data, struct ctdb_local_message);
1888
1889         srvid_dispatch(m->ctdb->srv, m->srvid, CTDB_SRVID_ALL, m->data);
1890         talloc_free(m);
1891 }
1892
1893 static int ctdb_local_message(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data)
1894 {
1895         struct ctdb_local_message *m;
1896         m = talloc(ctdb, struct ctdb_local_message);
1897         CTDB_NO_MEMORY(ctdb, m);
1898
1899         m->ctdb = ctdb;
1900         m->srvid = srvid;
1901         m->data  = data;
1902         m->data.dptr = talloc_memdup(m, m->data.dptr, m->data.dsize);
1903         if (m->data.dptr == NULL) {
1904                 talloc_free(m);
1905                 return -1;
1906         }
1907
1908         /* this needs to be done as an event to prevent recursion */
1909         tevent_add_timer(ctdb->ev, m, timeval_zero(),
1910                          ctdb_local_message_trigger, m);
1911         return 0;
1912 }
1913
1914 /*
1915   send a ctdb message
1916 */
1917 int ctdb_daemon_send_message(struct ctdb_context *ctdb, uint32_t pnn,
1918                              uint64_t srvid, TDB_DATA data)
1919 {
1920         struct ctdb_req_message_old *r;
1921         int len;
1922
1923         if (ctdb->methods == NULL) {
1924                 DEBUG(DEBUG_INFO,(__location__ " Failed to send message. Transport is DOWN\n"));
1925                 return -1;
1926         }
1927
1928         /* see if this is a message to ourselves */
1929         if (pnn == ctdb->pnn) {
1930                 return ctdb_local_message(ctdb, srvid, data);
1931         }
1932
1933         len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
1934         r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_MESSAGE, len,
1935                                     struct ctdb_req_message_old);
1936         CTDB_NO_MEMORY(ctdb, r);
1937
1938         r->hdr.destnode  = pnn;
1939         r->srvid         = srvid;
1940         r->datalen       = data.dsize;
1941         memcpy(&r->data[0], data.dptr, data.dsize);
1942
1943         ctdb_queue_packet(ctdb, &r->hdr);
1944
1945         talloc_free(r);
1946         return 0;
1947 }
1948
1949
1950
1951 struct ctdb_client_notify_list {
1952         struct ctdb_client_notify_list *next, *prev;
1953         struct ctdb_context *ctdb;
1954         uint64_t srvid;
1955         TDB_DATA data;
1956 };
1957
1958
1959 static int ctdb_client_notify_destructor(struct ctdb_client_notify_list *nl)
1960 {
1961         int ret;
1962
1963         DEBUG(DEBUG_ERR,("Sending client notify message for srvid:%llu\n", (unsigned long long)nl->srvid));
1964
1965         ret = ctdb_daemon_send_message(nl->ctdb, CTDB_BROADCAST_CONNECTED, (unsigned long long)nl->srvid, nl->data);
1966         if (ret != 0) {
1967                 DEBUG(DEBUG_ERR,("Failed to send client notify message\n"));
1968         }
1969
1970         return 0;
1971 }
1972
1973 int32_t ctdb_control_register_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
1974 {
1975         struct ctdb_notify_data_old *notify = (struct ctdb_notify_data_old *)indata.dptr;
1976         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
1977         struct ctdb_client_notify_list *nl;
1978
1979         DEBUG(DEBUG_INFO,("Register srvid %llu for client %d\n", (unsigned long long)notify->srvid, client_id));
1980
1981         if (indata.dsize < offsetof(struct ctdb_notify_data_old, notify_data)) {
1982                 DEBUG(DEBUG_ERR,(__location__ " Too little data in control : %d\n", (int)indata.dsize));
1983                 return -1;
1984         }
1985
1986         if (indata.dsize != (notify->len + offsetof(struct ctdb_notify_data_old, notify_data))) {
1987                 DEBUG(DEBUG_ERR,(__location__ " Wrong amount of data in control. Got %d, expected %d\n", (int)indata.dsize, (int)(notify->len + offsetof(struct ctdb_notify_data_old, notify_data))));
1988                 return -1;
1989         }
1990
1991
1992         if (client == NULL) {
1993                 DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
1994                 return -1;
1995         }
1996
1997         for(nl=client->notify; nl; nl=nl->next) {
1998                 if (nl->srvid == notify->srvid) {
1999                         break;
2000                 }
2001         }
2002         if (nl != NULL) {
2003                 DEBUG(DEBUG_ERR,(__location__ " Notification for srvid:%llu already exists for this client\n", (unsigned long long)notify->srvid));
2004                 return -1;
2005         }
2006
2007         nl = talloc(client, struct ctdb_client_notify_list);
2008         CTDB_NO_MEMORY(ctdb, nl);
2009         nl->ctdb       = ctdb;
2010         nl->srvid      = notify->srvid;
2011         nl->data.dsize = notify->len;
2012         nl->data.dptr  = talloc_memdup(nl, notify->notify_data,
2013                                        nl->data.dsize);
2014         CTDB_NO_MEMORY(ctdb, nl->data.dptr);
2015
2016         DLIST_ADD(client->notify, nl);
2017         talloc_set_destructor(nl, ctdb_client_notify_destructor);
2018
2019         return 0;
2020 }
2021
2022 int32_t ctdb_control_deregister_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
2023 {
2024         uint64_t srvid = *(uint64_t *)indata.dptr;
2025         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2026         struct ctdb_client_notify_list *nl;
2027
2028         DEBUG(DEBUG_INFO,("Deregister srvid %llu for client %d\n", (unsigned long long)srvid, client_id));
2029
2030         if (client == NULL) {
2031                 DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
2032                 return -1;
2033         }
2034
2035         for(nl=client->notify; nl; nl=nl->next) {
2036                 if (nl->srvid == srvid) {
2037                         break;
2038                 }
2039         }
2040         if (nl == NULL) {
2041                 DEBUG(DEBUG_ERR,(__location__ " No notification for srvid:%llu found for this client\n", (unsigned long long)srvid));
2042                 return -1;
2043         }
2044
2045         DLIST_REMOVE(client->notify, nl);
2046         talloc_set_destructor(nl, NULL);
2047         talloc_free(nl);
2048
2049         return 0;
2050 }
2051
2052 struct ctdb_client *ctdb_find_client_by_pid(struct ctdb_context *ctdb, pid_t pid)
2053 {
2054         struct ctdb_client_pid_list *client_pid;
2055
2056         for (client_pid = ctdb->client_pids; client_pid; client_pid=client_pid->next) {
2057                 if (client_pid->pid == pid) {
2058                         return client_pid->client;
2059                 }
2060         }
2061         return NULL;
2062 }
2063
2064
2065 /* This control is used by samba when probing if a process (of a samba daemon)
2066    exists on the node.
2067    Samba does this when it needs/wants to check if a subrecord in one of the
2068    databases is still valid, or if it is stale and can be removed.
2069    If the node is in unhealthy or stopped state we just kill of the samba
2070    process holding this sub-record and return to the calling samba that
2071    the process does not exist.
2072    This allows us to forcefully recall subrecords registered by samba processes
2073    on banned and stopped nodes.
2074 */
2075 int32_t ctdb_control_process_exists(struct ctdb_context *ctdb, pid_t pid)
2076 {
2077         struct ctdb_client *client;
2078
2079         client = ctdb_find_client_by_pid(ctdb, pid);
2080         if (client == NULL) {
2081                 return -1;
2082         }
2083
2084         if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE) {
2085                 DEBUG(DEBUG_NOTICE,
2086                       ("Killing client with pid:%d on banned/stopped node\n",
2087                        (int)pid));
2088                 talloc_free(client);
2089                 return -1;
2090         }
2091
2092         return kill(pid, 0);
2093 }
2094
2095 int32_t ctdb_control_check_pid_srvid(struct ctdb_context *ctdb,
2096                                      TDB_DATA indata)
2097 {
2098         struct ctdb_client_pid_list *client_pid;
2099         pid_t pid;
2100         uint64_t srvid;
2101         int ret;
2102
2103         pid = *(pid_t *)indata.dptr;
2104         srvid = *(uint64_t *)(indata.dptr + sizeof(pid_t));
2105
2106         for (client_pid = ctdb->client_pids;
2107              client_pid != NULL;
2108              client_pid = client_pid->next) {
2109                 if (client_pid->pid == pid) {
2110                         ret = srvid_exists(ctdb->srv, srvid,
2111                                            client_pid->client);
2112                         if (ret == 0) {
2113                                 return 0;
2114                         }
2115                 }
2116         }
2117
2118         return -1;
2119 }
2120
2121 int ctdb_control_getnodesfile(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
2122 {
2123         struct ctdb_node_map_old *node_map = NULL;
2124
2125         CHECK_CONTROL_DATA_SIZE(0);
2126
2127         node_map = ctdb_read_nodes_file(ctdb, ctdb->nodes_file);
2128         if (node_map == NULL) {
2129                 DEBUG(DEBUG_ERR, ("Failed to read nodes file\n"));
2130                 return -1;
2131         }
2132
2133         outdata->dptr  = (unsigned char *)node_map;
2134         outdata->dsize = talloc_get_size(outdata->dptr);
2135
2136         return 0;
2137 }
2138
/*
  Shut the daemon down in an orderly fashion and exit with exit_code.

  If the daemon is already in CTDB_RUNSTATE_SHUTDOWN the call is a no-op
  and returns to the caller; otherwise this function does not return.
 */
void ctdb_shutdown_sequence(struct ctdb_context *ctdb, int exit_code)
{
        /* guard against re-entry while a shutdown is in progress */
        if (ctdb->runstate == CTDB_RUNSTATE_SHUTDOWN) {
                DEBUG(DEBUG_NOTICE,("Already shutting down so will not proceed.\n"));
                return;
        }

        DEBUG(DEBUG_ERR,("Shutdown sequence commencing.\n"));
        ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SHUTDOWN);
        /* the teardown steps below are order dependent */
        ctdb_stop_recoverd(ctdb);
        ctdb_stop_keepalive(ctdb);
        ctdb_stop_monitoring(ctdb);
        ctdb_release_all_ips(ctdb);
        ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
        ctdb_stop_eventd(ctdb);
        /* the transport may be down, or may not provide a shutdown hook */
        if (ctdb->methods != NULL && ctdb->methods->shutdown != NULL) {
                ctdb->methods->shutdown(ctdb);
        }

        DEBUG(DEBUG_ERR,("Shutdown sequence complete, exiting.\n"));
        exit(exit_code);
}
2161
2162 /* When forking the main daemon and the child process needs to connect
2163  * back to the daemon as a client process, this function can be used
2164  * to change the ctdb context from daemon into client mode.  The child
2165  * process must be created using ctdb_fork() and not fork() -
2166  * ctdb_fork() does some necessary housekeeping.
2167  */
2168 int switch_from_server_to_client(struct ctdb_context *ctdb)
2169 {
2170         int ret;
2171
2172         /* get a new event context */
2173         ctdb->ev = tevent_context_init(ctdb);
2174         if (ctdb->ev == NULL) {
2175                 DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n"));
2176                 exit(1);
2177         }
2178         tevent_loop_allow_nesting(ctdb->ev);
2179
2180         /* Connect to main CTDB daemon */
2181         ret = ctdb_socket_connect(ctdb);
2182         if (ret != 0) {
2183                 DEBUG(DEBUG_ALERT, (__location__ " Failed to init ctdb client\n"));
2184                 return -1;
2185         }
2186
2187         ctdb->can_send_controls = true;
2188
2189         return 0;
2190 }