ctdb/server/ctdb_daemon.c

   1 /*
   2    ctdb daemon code
   3
   4    Copyright (C) Andrew Tridgell  2006
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20 #include "replace.h"
  21 #include "system/network.h"
  22 #include "system/filesys.h"
  23 #include "system/wait.h"
  24 #include "system/time.h"
  25
  26 #include <talloc.h>
  27 /* Allow use of deprecated function tevent_loop_allow_nesting() */
  28 #define TEVENT_DEPRECATED
  29 #include <tevent.h>
  30 #include <tdb.h>
  31
  32 #include "lib/tdb_wrap/tdb_wrap.h"
  33 #include "lib/util/dlinklist.h"
  34 #include "lib/util/debug.h"
  35 #include "lib/util/time.h"
  36 #include "lib/util/blocking.h"
  37 #include "lib/util/become_daemon.h"
  38
  39 #include "version.h"
  40 #include "ctdb_private.h"
  41 #include "ctdb_client.h"
  42
  43 #include "common/rb_tree.h"
  44 #include "common/reqid.h"
  45 #include "common/system.h"
  46 #include "common/common.h"
  47 #include "common/logging.h"
  48 #include "common/pidfile.h"
  49 #include "common/sock_io.h"
  50
/*
  Element of a doubly linked list (next/prev) that pairs a process id
  with a ctdb_client structure.
  NOTE(review): the list head and its users are outside this part of
  the file - presumably one entry per connected client process; confirm.
 */
struct ctdb_client_pid_list {
        struct ctdb_client_pid_list *next, *prev;
        struct ctdb_context *ctdb;
        pid_t pid;
        struct ctdb_client *client;
};
  57
/* Pidfile path; starts out NULL (presumably assigned by startup code - confirm) */
const char *ctdbd_pidfile = NULL;
/* Handle for the pidfile; starts out NULL */
static struct pidfile_context *ctdbd_pidfile_ctx = NULL;

/* Forward declaration: handler for a packet received from a local client */
static void daemon_incoming_packet(void *, struct ctdb_req_header *);

/*
 * Pid that print_exit_message() compares against getpid()
 * (presumably the pid of the main daemon process - confirm where set)
 */
static pid_t __ctdbd_pid;
  64
  65 static void print_exit_message(void)
  66 {
  67         if (getpid() == __ctdbd_pid) {
  68                 DEBUG(DEBUG_NOTICE,("CTDB daemon shutting down\n"));
  69
  70                 /* Wait a second to allow pending log messages to be flushed */
  71                 sleep(1);
  72         }
  73 }
  74
  75 #ifdef HAVE_GETRUSAGE
  76
/* One CPU usage sample as recorded by ctdb_cpu_check_threshold() */
struct cpu_check_threshold_data {
        /* CPU utilisation computed for the sample, in percent */
        unsigned short percent;
        /* timestamp the timer handler was invoked with */
        struct timeval timeofday;
        /* sum of user and system CPU time reported by getrusage() */
        struct timeval ru_time;
};
  82
  83 static void ctdb_cpu_check_threshold(struct tevent_context *ev,
  84                                      struct tevent_timer *te,
  85                                      struct timeval tv,
  86                                      void *private_data)
  87 {
  88         struct ctdb_context *ctdb = talloc_get_type_abort(
  89                 private_data, struct ctdb_context);
  90         uint32_t interval = 60;
  91
  92         static unsigned short threshold = 0;
  93         static struct cpu_check_threshold_data prev = {
  94                 .percent = 0,
  95                 .timeofday = { .tv_sec = 0 },
  96                 .ru_time = { .tv_sec = 0 },
  97         };
  98
  99         struct rusage usage;
 100         struct cpu_check_threshold_data curr = {
 101                 .percent = 0,
 102         };
 103         int64_t ru_time_diff, timeofday_diff;
 104         bool first;
 105         int ret;
 106
 107         /*
 108          * Cache the threshold so that we don't waste time checking
 109          * the environment variable every time
 110          */
 111         if (threshold == 0) {
 112                 const char *t;
 113
 114                 threshold = 90;
 115
 116                 t = getenv("CTDB_TEST_CPU_USAGE_THRESHOLD");
 117                 if (t != NULL) {
 118                         int th;
 119
 120                         th = atoi(t);
 121                         if (th <= 0 || th > 100) {
 122                                 DBG_WARNING("Failed to parse env var: %s\n", t);
 123                         } else {
 124                                 threshold = th;
 125                         }
 126                 }
 127         }
 128
 129         ret = getrusage(RUSAGE_SELF, &usage);
 130         if (ret != 0) {
 131                 DBG_WARNING("rusage() failed: %d\n", ret);
 132                 goto next;
 133         }
 134
 135         /* Sum the system and user CPU usage */
 136         curr.ru_time = timeval_sum(&usage.ru_utime, &usage.ru_stime);
 137
 138         curr.timeofday = tv;
 139
 140         first = timeval_is_zero(&prev.timeofday);
 141         if (first) {
 142                 /* No previous values recorded so no calculation to do */
 143                 goto done;
 144         }
 145
 146         timeofday_diff = usec_time_diff(&curr.timeofday, &prev.timeofday);
 147         if (timeofday_diff <= 0) {
 148                 /*
 149                  * Time went backwards or didn't progress so no (sane)
 150                  * calculation can be done
 151                  */
 152                 goto done;
 153         }
 154
 155         ru_time_diff = usec_time_diff(&curr.ru_time, &prev.ru_time);
 156
 157         curr.percent = ru_time_diff * 100 / timeofday_diff;
 158
 159         if (curr.percent >= threshold) {
 160                 /* Log only if the utilisation changes */
 161                 if (curr.percent != prev.percent) {
 162                         D_WARNING("WARNING: CPU utilisation %hu%% >= "
 163                                   "threshold (%hu%%)\n",
 164                                   curr.percent,
 165                                   threshold);
 166                 }
 167         } else {
 168                 /* Log if the utilisation falls below the threshold */
 169                 if (prev.percent >= threshold) {
 170                         D_WARNING("WARNING: CPU utilisation %hu%% < "
 171                                   "threshold (%hu%%)\n",
 172                                   curr.percent,
 173                                   threshold);
 174                 }
 175         }
 176
 177 done:
 178         prev = curr;
 179
 180 next:
 181         tevent_add_timer(ctdb->ev, ctdb,
 182                          timeval_current_ofs(interval, 0),
 183                          ctdb_cpu_check_threshold,
 184                          ctdb);
 185 }
 186
 187 static void ctdb_start_cpu_check_threshold(struct ctdb_context *ctdb)
 188 {
 189         tevent_add_timer(ctdb->ev, ctdb,
 190                          timeval_current(),
 191                          ctdb_cpu_check_threshold,
 192                          ctdb);
 193 }
 194 #endif /* HAVE_GETRUSAGE */
 195
 196 static void ctdb_time_tick(struct tevent_context *ev, struct tevent_timer *te,
 197                                   struct timeval t, void *private_data)
 198 {
 199         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
 200
 201         if (getpid() != ctdb->ctdbd_pid) {
 202                 return;
 203         }
 204
 205         tevent_add_timer(ctdb->ev, ctdb,
 206                          timeval_current_ofs(1, 0),
 207                          ctdb_time_tick, ctdb);
 208 }
 209
 210 /* Used to trigger a dummy event once per second, to make
 211  * detection of hangs more reliable.
 212  */
 213 static void ctdb_start_time_tickd(struct ctdb_context *ctdb)
 214 {
 215         tevent_add_timer(ctdb->ev, ctdb,
 216                          timeval_current_ofs(1, 0),
 217                          ctdb_time_tick, ctdb);
 218 }
 219
/*
  Start the daemon's periodic activities by invoking each of the
  start functions below in turn.  The CPU utilisation check is only
  compiled in when HAVE_GETRUSAGE is defined.
 */
static void ctdb_start_periodic_events(struct ctdb_context *ctdb)
{
        /* start monitoring for connected/disconnected nodes */
        ctdb_start_keepalive(ctdb);

        /* start periodic update of tcp tickle lists */
        ctdb_start_tcp_tickle_update(ctdb);

        /* start listening for recovery daemon pings */
        ctdb_control_recd_ping(ctdb);

        /* start listening to timer ticks */
        ctdb_start_time_tickd(ctdb);

#ifdef HAVE_GETRUSAGE
        /* start warning about high CPU utilisation of this process */
        ctdb_start_cpu_check_threshold(ctdb);
#endif /* HAVE_GETRUSAGE */
}
 238
 239 static void ignore_signal(int signum)
 240 {
 241         struct sigaction act;
 242
 243         memset(&act, 0, sizeof(act));
 244
 245         act.sa_handler = SIG_IGN;
 246         sigemptyset(&act.sa_mask);
 247         sigaddset(&act.sa_mask, signum);
 248         sigaction(signum, &act, NULL);
 249 }
 250
 251
 252 /*
 253   send a packet to a client
 254  */
 255 static int daemon_queue_send(struct ctdb_client *client, struct ctdb_req_header *hdr)
 256 {
 257         CTDB_INCREMENT_STAT(client->ctdb, client_packets_sent);
 258         if (hdr->operation == CTDB_REQ_MESSAGE) {
 259                 if (ctdb_queue_length(client->queue) > client->ctdb->tunable.max_queue_depth_drop_msg) {
 260                         DEBUG(DEBUG_ERR,("CTDB_REQ_MESSAGE queue full - killing client connection.\n"));
 261                         talloc_free(client);
 262                         return -1;
 263                 }
 264         }
 265         return ctdb_queue_send(client->queue, (uint8_t *)hdr, hdr->length);
 266 }
 267
 268 /*
 269   message handler for when we are in daemon mode. This redirects the message
 270   to the right client
 271  */
 272 static void daemon_message_handler(uint64_t srvid, TDB_DATA data,
 273                                    void *private_data)
 274 {
 275         struct ctdb_client *client = talloc_get_type(private_data, struct ctdb_client);
 276         struct ctdb_req_message_old *r;
 277         int len;
 278
 279         /* construct a message to send to the client containing the data */
 280         len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
 281         r = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_MESSAGE,
 282                                len, struct ctdb_req_message_old);
 283         CTDB_NO_MEMORY_VOID(client->ctdb, r);
 284
 285         talloc_set_name_const(r, "req_message packet");
 286
 287         r->srvid         = srvid;
 288         r->datalen       = data.dsize;
 289         memcpy(&r->data[0], data.dptr, data.dsize);
 290
 291         daemon_queue_send(client, &r->hdr);
 292
 293         talloc_free(r);
 294 }
 295
 296 /*
 297   this is called when the ctdb daemon received a ctdb request to
 298   set the srvid from the client
 299  */
 300 int daemon_register_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
 301 {
 302         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
 303         int res;
 304         if (client == NULL) {
 305                 DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_register_message_handler\n"));
 306                 return -1;
 307         }
 308         res = srvid_register(ctdb->srv, client, srvid, daemon_message_handler,
 309                              client);
 310         if (res != 0) {
 311                 DEBUG(DEBUG_ERR,(__location__ " Failed to register handler %llu in daemon\n",
 312                          (unsigned long long)srvid));
 313         } else {
 314                 DEBUG(DEBUG_INFO,(__location__ " Registered message handler for srvid=%llu\n",
 315                          (unsigned long long)srvid));
 316         }
 317
 318         return res;
 319 }
 320
 321 /*
 322   this is called when the ctdb daemon received a ctdb request to
 323   remove a srvid from the client
 324  */
 325 int daemon_deregister_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
 326 {
 327         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
 328         if (client == NULL) {
 329                 DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_deregister_message_handler\n"));
 330                 return -1;
 331         }
 332         return srvid_deregister(ctdb->srv, srvid, client);
 333 }
 334
 335 void daemon_tunnel_handler(uint64_t tunnel_id, TDB_DATA data,
 336                            void *private_data)
 337 {
 338         struct ctdb_client *client =
 339                 talloc_get_type_abort(private_data, struct ctdb_client);
 340         struct ctdb_req_tunnel_old *c, *pkt;
 341         size_t len;
 342
 343         pkt = (struct ctdb_req_tunnel_old *)data.dptr;
 344
 345         len = offsetof(struct ctdb_req_tunnel_old, data) + pkt->datalen;
 346         c = ctdbd_allocate_pkt(client->ctdb, client->ctdb, CTDB_REQ_TUNNEL,
 347                                len, struct ctdb_req_tunnel_old);
 348         if (c == NULL) {
 349                 DEBUG(DEBUG_ERR, ("Memory error in daemon_tunnel_handler\n"));
 350                 return;
 351         }
 352
 353         talloc_set_name_const(c, "req_tunnel packet");
 354
 355         c->tunnel_id = tunnel_id;
 356         c->flags = pkt->flags;
 357         c->datalen = pkt->datalen;
 358         memcpy(c->data, pkt->data, pkt->datalen);
 359
 360         daemon_queue_send(client, &c->hdr);
 361
 362         talloc_free(c);
 363 }
 364
 365 /*
 366   destroy a ctdb_client
 367 */
 368 static int ctdb_client_destructor(struct ctdb_client *client)
 369 {
 370         struct ctdb_db_context *ctdb_db;
 371
 372         ctdb_takeover_client_destructor_hook(client);
 373         reqid_remove(client->ctdb->idr, client->client_id);
 374         client->ctdb->num_clients--;
 375
 376         if (client->num_persistent_updates != 0) {
 377                 DEBUG(DEBUG_ERR,(__location__ " Client disconnecting with %u persistent updates in flight. Starting recovery\n", client->num_persistent_updates));
 378                 client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
 379         }
 380         ctdb_db = find_ctdb_db(client->ctdb, client->db_id);
 381         if (ctdb_db) {
 382                 DEBUG(DEBUG_ERR, (__location__ " client exit while transaction "
 383                                   "commit active. Forcing recovery.\n"));
 384                 client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
 385
 386                 /*
 387                  * trans3 transaction state:
 388                  *
 389                  * The destructor sets the pointer to NULL.
 390                  */
 391                 talloc_free(ctdb_db->persistent_state);
 392         }
 393
 394         return 0;
 395 }
 396
 397
 398 /*
 399   this is called when the ctdb daemon received a ctdb request message
 400   from a local client over the unix domain socket
 401  */
 402 static void daemon_request_message_from_client(struct ctdb_client *client,
 403                                                struct ctdb_req_message_old *c)
 404 {
 405         TDB_DATA data;
 406         int res;
 407
 408         if (c->hdr.destnode == CTDB_CURRENT_NODE) {
 409                 c->hdr.destnode = ctdb_get_pnn(client->ctdb);
 410         }
 411
 412         /* maybe the message is for another client on this node */
 413         if (ctdb_get_pnn(client->ctdb)==c->hdr.destnode) {
 414                 ctdb_request_message(client->ctdb, (struct ctdb_req_header *)c);
 415                 return;
 416         }
 417
 418         /* its for a remote node */
 419         data.dptr = &c->data[0];
 420         data.dsize = c->datalen;
 421         res = ctdb_daemon_send_message(client->ctdb, c->hdr.destnode,
 422                                        c->srvid, data);
 423         if (res != 0) {
 424                 DEBUG(DEBUG_ERR,(__location__ " Failed to send message to remote node %u\n",
 425                          c->hdr.destnode));
 426         }
 427 }
 428
 429
/*
  Per-call state consumed by daemon_call_from_client_callback() when
  a call issued for a local client completes.
 */
struct daemon_call_state {
        /* client the reply packet is queued to */
        struct ctdb_client *client;
        /* copied into the reqid field of the reply header */
        uint32_t reqid;
        /* the call; its status and reply_data go into the reply */
        struct ctdb_call *call;
        /* passed to CTDB_UPDATE_LATENCY for the call_latency statistic */
        struct timeval start_time;

        /* readonly request ? */
        uint32_t readonly_fetch;
        /*
         * When readonly_fetch is set and this equals CTDB_FETCH_FUNC,
         * the ctdb_ltdb_header is stripped from the reply data.
         */
        uint32_t client_callid;
};
 440
 441 /*
 442    complete a call from a client
 443 */
 444 static void daemon_call_from_client_callback(struct ctdb_call_state *state)
 445 {
 446         struct daemon_call_state *dstate = talloc_get_type(state->async.private_data,
 447                                                            struct daemon_call_state);
 448         struct ctdb_reply_call_old *r;
 449         int res;
 450         uint32_t length;
 451         struct ctdb_client *client = dstate->client;
 452         struct ctdb_db_context *ctdb_db = state->ctdb_db;
 453
 454         talloc_steal(client, dstate);
 455         talloc_steal(dstate, dstate->call);
 456
 457         res = ctdb_daemon_call_recv(state, dstate->call);
 458         if (res != 0) {
 459                 DEBUG(DEBUG_ERR, (__location__ " ctdbd_call_recv() returned error\n"));
 460                 CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
 461
 462                 CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 1", call_latency, dstate->start_time);
 463                 return;
 464         }
 465
 466         length = offsetof(struct ctdb_reply_call_old, data) + dstate->call->reply_data.dsize;
 467         /* If the client asked for readonly FETCH, we remapped this to
 468            FETCH_WITH_HEADER when calling the daemon. So we must
 469            strip the extra header off the reply data before passing
 470            it back to the client.
 471         */
 472         if (dstate->readonly_fetch
 473         && dstate->client_callid == CTDB_FETCH_FUNC) {
 474                 length -= sizeof(struct ctdb_ltdb_header);
 475         }
 476
 477         r = ctdbd_allocate_pkt(client->ctdb, dstate, CTDB_REPLY_CALL,
 478                                length, struct ctdb_reply_call_old);
 479         if (r == NULL) {
 480                 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate reply_call in ctdb daemon\n"));
 481                 CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
 482                 CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 2", call_latency, dstate->start_time);
 483                 return;
 484         }
 485         r->hdr.reqid        = dstate->reqid;
 486         r->status           = dstate->call->status;
 487
 488         if (dstate->readonly_fetch
 489         && dstate->client_callid == CTDB_FETCH_FUNC) {
 490                 /* client only asked for a FETCH so we must strip off
 491                    the extra ctdb_ltdb header
 492                 */
 493                 r->datalen          = dstate->call->reply_data.dsize - sizeof(struct ctdb_ltdb_header);
 494                 memcpy(&r->data[0], dstate->call->reply_data.dptr + sizeof(struct ctdb_ltdb_header), r->datalen);
 495         } else {
 496                 r->datalen          = dstate->call->reply_data.dsize;
 497                 memcpy(&r->data[0], dstate->call->reply_data.dptr, r->datalen);
 498         }
 499
 500         res = daemon_queue_send(client, &r->hdr);
 501         if (res == -1) {
 502                 /* client is dead - return immediately */
 503                 return;
 504         }
 505         if (res != 0) {
 506                 DEBUG(DEBUG_ERR, (__location__ " Failed to queue packet from daemon to client\n"));
 507         }
 508         CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 3", call_latency, dstate->start_time);
 509         CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
 510         talloc_free(dstate);
 511 }
 512
/*
  Identifies a client by id instead of by pointer, so that the client
  can be looked up again (and found to be gone) when a packet is
  processed later - see daemon_incoming_packet_wrap().
 */
struct ctdb_daemon_packet_wrap {
        /* context whose idr is used for the lookup */
        struct ctdb_context *ctdb;
        /* id the client is looked up by */
        uint32_t client_id;
};
 517
 518 /*
 519   a wrapper to catch disconnected clients
 520  */
 521 static void daemon_incoming_packet_wrap(void *p, struct ctdb_req_header *hdr)
 522 {
 523         struct ctdb_client *client;
 524         struct ctdb_daemon_packet_wrap *w = talloc_get_type(p,
 525                                                             struct ctdb_daemon_packet_wrap);
 526         if (w == NULL) {
 527                 DEBUG(DEBUG_CRIT,(__location__ " Bad packet type '%s'\n", talloc_get_name(p)));
 528                 return;
 529         }
 530
 531         client = reqid_find(w->ctdb->idr, w->client_id, struct ctdb_client);
 532         if (client == NULL) {
 533                 DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
 534                          w->client_id));
 535                 talloc_free(w);
 536                 return;
 537         }
 538         talloc_free(w);
 539
 540         /* process it */
 541         daemon_incoming_packet(client, hdr);
 542 }
 543
/* One deferred call, linked into a ctdb_deferred_fetch_queue */
struct ctdb_deferred_fetch_call {
        /* list linkage within the queue's deferred_calls list */
        struct ctdb_deferred_fetch_call *next, *prev;
        /* the deferred request packet */
        struct ctdb_req_call_old *c;
        /* context and id of the client that sent the request */
        struct ctdb_daemon_packet_wrap *w;
};
 549
/*
  Queue of calls deferred behind an in-flight fetch.  Its destructor
  (deferred_fetch_queue_destructor) schedules the queued calls for
  reprocessing.
 */
struct ctdb_deferred_fetch_queue {
        /* head of the list of deferred calls, in arrival order */
        struct ctdb_deferred_fetch_call *deferred_calls;
};
 553
/* Private data handed to reprocess_deferred_call() via a timer event */
struct ctdb_deferred_requeue {
        /* the deferred call to reprocess */
        struct ctdb_deferred_fetch_call *dfc;
        /* the client the call is reprocessed for */
        struct ctdb_client *client;
};
 558
 559 /* called from a timer event and starts reprocessing the deferred call.*/
 560 static void reprocess_deferred_call(struct tevent_context *ev,
 561                                     struct tevent_timer *te,
 562                                     struct timeval t, void *private_data)
 563 {
 564         struct ctdb_deferred_requeue *dfr = (struct ctdb_deferred_requeue *)private_data;
 565         struct ctdb_client *client = dfr->client;
 566
 567         talloc_steal(client, dfr->dfc->c);
 568         daemon_incoming_packet(client, (struct ctdb_req_header *)dfr->dfc->c);
 569         talloc_free(dfr);
 570 }
 571
 572 /* the referral context is destroyed either after a timeout or when the initial
 573    fetch-lock has finished.
 574    at this stage, immediately start reprocessing the queued up deferred
 575    calls so they get reprocessed immediately (and since we are dmaster at
 576    this stage, trigger the waiting smbd processes to pick up and acquire the
 577    record right away.
 578 */
 579 static int deferred_fetch_queue_destructor(struct ctdb_deferred_fetch_queue *dfq)
 580 {
 581
 582         /* need to reprocess the packets from the queue explicitly instead of
 583            just using a normal destructor since we need to
 584            call the clients in the same order as the requests queued up
 585         */
 586         while (dfq->deferred_calls != NULL) {
 587                 struct ctdb_client *client;
 588                 struct ctdb_deferred_fetch_call *dfc = dfq->deferred_calls;
 589                 struct ctdb_deferred_requeue *dfr;
 590
 591                 DLIST_REMOVE(dfq->deferred_calls, dfc);
 592
 593                 client = reqid_find(dfc->w->ctdb->idr, dfc->w->client_id, struct ctdb_client);
 594                 if (client == NULL) {
 595                         DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
 596                                  dfc->w->client_id));
 597                         continue;
 598                 }
 599
 600                 /* process it by pushing it back onto the eventloop */
 601                 dfr = talloc(client, struct ctdb_deferred_requeue);
 602                 if (dfr == NULL) {
 603                         DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch requeue structure\n"));
 604                         continue;
 605                 }
 606
 607                 dfr->dfc    = talloc_steal(dfr, dfc);
 608                 dfr->client = client;
 609
 610                 tevent_add_timer(dfc->w->ctdb->ev, client, timeval_zero(),
 611                                  reprocess_deferred_call, dfr);
 612         }
 613
 614         return 0;
 615 }
 616
 617 /* insert the new deferral context into the rb tree.
 618    there should never be a pre-existing context here, but check for it
 619    warn and destroy the previous context if there is already a deferral context
 620    for this key.
 621 */
 622 static void *insert_dfq_callback(void *parm, void *data)
 623 {
 624         if (data) {
 625                 DEBUG(DEBUG_ERR,("Already have DFQ registered. Free old %p and create new %p\n", data, parm));
 626                 talloc_free(data);
 627         }
 628         return parm;
 629 }
 630
 631 /* if the original fetch-lock did not complete within a reasonable time,
 632    free the context and context for all deferred requests to cause them to be
 633    re-inserted into the event system.
 634 */
 635 static void dfq_timeout(struct tevent_context *ev, struct tevent_timer *te,
 636                         struct timeval t, void *private_data)
 637 {
 638         talloc_free(private_data);
 639 }
 640
 641 /* This function is used in the local daemon to register a KEY in a database
 642    for being "fetched"
 643    While the remote fetch is in-flight, any further attempts to re-fetch the
 644    same record will be deferred until the fetch completes.
 645 */
 646 static int setup_deferred_fetch_locks(struct ctdb_db_context *ctdb_db, struct ctdb_call *call)
 647 {
 648         uint32_t *k;
 649         struct ctdb_deferred_fetch_queue *dfq;
 650
 651         k = ctdb_key_to_idkey(call, call->key);
 652         if (k == NULL) {
 653                 DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
 654                 return -1;
 655         }
 656
 657         dfq  = talloc(call, struct ctdb_deferred_fetch_queue);
 658         if (dfq == NULL) {
 659                 DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch queue structure\n"));
 660                 talloc_free(k);
 661                 return -1;
 662         }
 663         dfq->deferred_calls = NULL;
 664
 665         trbt_insertarray32_callback(ctdb_db->deferred_fetch, k[0], &k[0], insert_dfq_callback, dfq);
 666
 667         talloc_set_destructor(dfq, deferred_fetch_queue_destructor);
 668
 669         /* If the fetch hasn't completed in 30 seconds, just tear it all down
 670            and let it try again as the events are reissued */
 671         tevent_add_timer(ctdb_db->ctdb->ev, dfq, timeval_current_ofs(30, 0),
 672                          dfq_timeout, dfq);
 673
 674         talloc_free(k);
 675         return 0;
 676 }
 677
 678 /* check if this is a duplicate request to a fetch already in-flight
 679    if it is, make this call deferred to be reprocessed later when
 680    the in-flight fetch completes.
 681 */
 682 static int requeue_duplicate_fetch(struct ctdb_db_context *ctdb_db, struct ctdb_client *client, TDB_DATA key, struct ctdb_req_call_old *c)
 683 {
 684         uint32_t *k;
 685         struct ctdb_deferred_fetch_queue *dfq;
 686         struct ctdb_deferred_fetch_call *dfc;
 687
 688         k = ctdb_key_to_idkey(c, key);
 689         if (k == NULL) {
 690                 DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
 691                 return -1;
 692         }
 693
 694         dfq = trbt_lookuparray32(ctdb_db->deferred_fetch, k[0], &k[0]);
 695         if (dfq == NULL) {
 696                 talloc_free(k);
 697                 return -1;
 698         }
 699
 700
 701         talloc_free(k);
 702
 703         dfc = talloc(dfq, struct ctdb_deferred_fetch_call);
 704         if (dfc == NULL) {
 705                 DEBUG(DEBUG_ERR, ("Failed to allocate deferred fetch call structure\n"));
 706                 return -1;
 707         }
 708
 709         dfc->w = talloc(dfc, struct ctdb_daemon_packet_wrap);
 710         if (dfc->w == NULL) {
 711                 DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch daemon packet wrap structure\n"));
 712                 talloc_free(dfc);
 713                 return -1;
 714         }
 715
 716         dfc->c = talloc_steal(dfc, c);
 717         dfc->w->ctdb = ctdb_db->ctdb;
 718         dfc->w->client_id = client->client_id;
 719
 720         DLIST_ADD_END(dfq->deferred_calls, dfc);
 721
 722         return 0;
 723 }
 724
 725
 726 /*
 727   this is called when the ctdb daemon received a ctdb request call
 728   from a local client over the unix domain socket
 729  */
 730 static void daemon_request_call_from_client(struct ctdb_client *client,
 731                                             struct ctdb_req_call_old *c)
 732 {
 733         struct ctdb_call_state *state;
 734         struct ctdb_db_context *ctdb_db;
 735         struct daemon_call_state *dstate;
 736         struct ctdb_call *call;
 737         struct ctdb_ltdb_header header;
 738         TDB_DATA key, data;
 739         int ret;
 740         struct ctdb_context *ctdb = client->ctdb;
 741         struct ctdb_daemon_packet_wrap *w;
 742
 743         CTDB_INCREMENT_STAT(ctdb, total_calls);
 744         CTDB_INCREMENT_STAT(ctdb, pending_calls);
 745
 746         ctdb_db = find_ctdb_db(client->ctdb, c->db_id);
 747         if (!ctdb_db) {
 748                 DEBUG(DEBUG_ERR, (__location__ " Unknown database in request. db_id==0x%08x\n",
 749                           c->db_id));
 750                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 751                 return;
 752         }
 753
 754         if (ctdb_db->unhealthy_reason) {
 755                 /*
 756                  * this is just a warning, as the tdb should be empty anyway,
 757                  * and only persistent databases can be unhealthy, which doesn't
 758                  * use this code patch
 759                  */
 760                 DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in daemon_request_call_from_client(): %s\n",
 761                                      ctdb_db->db_name, ctdb_db->unhealthy_reason));
 762         }
 763
 764         key.dptr = c->data;
 765         key.dsize = c->keylen;
 766
 767         w = talloc(ctdb, struct ctdb_daemon_packet_wrap);
 768         CTDB_NO_MEMORY_VOID(ctdb, w);
 769
 770         w->ctdb = ctdb;
 771         w->client_id = client->client_id;
 772
 773         ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header,
 774                                            (struct ctdb_req_header *)c, &data,
 775                                            daemon_incoming_packet_wrap, w, true);
 776         if (ret == -2) {
 777                 /* will retry later */
 778                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 779                 return;
 780         }
 781
 782         talloc_free(w);
 783
 784         if (ret != 0) {
 785                 DEBUG(DEBUG_ERR,(__location__ " Unable to fetch record\n"));
 786                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 787                 return;
 788         }
 789
 790
 791         /* check if this fetch request is a duplicate for a
 792            request we already have in flight. If so defer it until
 793            the first request completes.
 794         */
 795         if (ctdb->tunable.fetch_collapse == 1) {
 796                 if (requeue_duplicate_fetch(ctdb_db, client, key, c) == 0) {
 797                         ret = ctdb_ltdb_unlock(ctdb_db, key);
 798                         if (ret != 0) {
 799                                 DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
 800                         }
 801                         CTDB_DECREMENT_STAT(ctdb, pending_calls);
 802                         talloc_free(data.dptr);
 803                         return;
 804                 }
 805         }
 806
 807         /* Dont do READONLY if we don't have a tracking database */
 808         if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db_readonly(ctdb_db)) {
 809                 c->flags &= ~CTDB_WANT_READONLY;
 810         }
 811
 812         if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
 813                 header.flags &= ~CTDB_REC_RO_FLAGS;
 814                 CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
 815                 CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes);
 816                 if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
 817                         ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
 818                 }
 819                 /* and clear out the tracking data */
 820                 if (tdb_delete(ctdb_db->rottdb, key) != 0) {
 821                         DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
 822                 }
 823         }
 824
 825         /* if we are revoking, we must defer all other calls until the revoke
 826          * had completed.
 827          */
 828         if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
 829                 talloc_free(data.dptr);
 830                 ret = ctdb_ltdb_unlock(ctdb_db, key);
 831
 832                 if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
 833                         ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
 834                 }
 835                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 836                 return;
 837         }
 838
 839         if ((header.dmaster == ctdb->pnn)
 840         && (!(c->flags & CTDB_WANT_READONLY))
 841         && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
 842                 header.flags   |= CTDB_REC_RO_REVOKING_READONLY;
 843                 if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
 844                         ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
 845                 }
 846                 ret = ctdb_ltdb_unlock(ctdb_db, key);
 847
 848                 if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, key, &header, data) != 0) {
 849                         ctdb_fatal(ctdb, "Failed to start record revoke");
 850                 }
 851                 talloc_free(data.dptr);
 852
 853                 if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
 854                         ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
 855                 }
 856
 857                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 858                 return;
 859         }
 860
 861         dstate = talloc(client, struct daemon_call_state);
 862         if (dstate == NULL) {
 863                 ret = ctdb_ltdb_unlock(ctdb_db, key);
 864                 if (ret != 0) {
 865                         DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
 866                 }
 867
 868                 DEBUG(DEBUG_ERR,(__location__ " Unable to allocate dstate\n"));
 869                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 870                 return;
 871         }
 872         dstate->start_time = timeval_current();
 873         dstate->client = client;
 874         dstate->reqid  = c->hdr.reqid;
 875         talloc_steal(dstate, data.dptr);
 876
 877         call = dstate->call = talloc_zero(dstate, struct ctdb_call);
 878         if (call == NULL) {
 879                 ret = ctdb_ltdb_unlock(ctdb_db, key);
 880                 if (ret != 0) {
 881                         DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
 882                 }
 883
 884                 DEBUG(DEBUG_ERR,(__location__ " Unable to allocate call\n"));
 885                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 886                 CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 1", call_latency, dstate->start_time);
 887                 return;
 888         }
 889
 890         dstate->readonly_fetch = 0;
 891         call->call_id = c->callid;
 892         call->key = key;
 893         call->call_data.dptr = c->data + c->keylen;
 894         call->call_data.dsize = c->calldatalen;
 895         call->flags = c->flags;
 896
 897         if (c->flags & CTDB_WANT_READONLY) {
 898                 /* client wants readonly record, so translate this into a
 899                    fetch with header. remember what the client asked for
 900                    so we can remap the reply back to the proper format for
 901                    the client in the reply
 902                  */
 903                 dstate->client_callid = call->call_id;
 904                 call->call_id = CTDB_FETCH_WITH_HEADER_FUNC;
 905                 dstate->readonly_fetch = 1;
 906         }
 907
 908         if (header.dmaster == ctdb->pnn) {
 909                 state = ctdb_call_local_send(ctdb_db, call, &header, &data);
 910         } else {
 911                 state = ctdb_daemon_call_send_remote(ctdb_db, call, &header);
 912                 if (ctdb->tunable.fetch_collapse == 1) {
 913                         /* This request triggered a remote fetch-lock.
 914                            set up a deferral for this key so any additional
 915                            fetch-locks are deferred until the current one
 916                            finishes.
 917                          */
 918                         setup_deferred_fetch_locks(ctdb_db, call);
 919                 }
 920         }
 921
 922         ret = ctdb_ltdb_unlock(ctdb_db, key);
 923         if (ret != 0) {
 924                 DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
 925         }
 926
 927         if (state == NULL) {
 928                 DEBUG(DEBUG_ERR,(__location__ " Unable to setup call send\n"));
 929                 CTDB_DECREMENT_STAT(ctdb, pending_calls);
 930                 CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 2", call_latency, dstate->start_time);
 931                 return;
 932         }
 933         talloc_steal(state, dstate);
 934         talloc_steal(client, state);
 935
 936         state->async.fn = daemon_call_from_client_callback;
 937         state->async.private_data = dstate;
 938 }
 939
 940
 941 static void daemon_request_control_from_client(struct ctdb_client *client,
 942                                                struct ctdb_req_control_old *c);
 943 static void daemon_request_tunnel_from_client(struct ctdb_client *client,
 944                                               struct ctdb_req_tunnel_old *c);
 945
 946 /* data contains a packet from the client */
 947 static void daemon_incoming_packet(void *p, struct ctdb_req_header *hdr)
 948 {
 949         struct ctdb_client *client = talloc_get_type(p, struct ctdb_client);
 950         TALLOC_CTX *tmp_ctx;
 951         struct ctdb_context *ctdb = client->ctdb;
 952
 953         /* place the packet as a child of a tmp_ctx. We then use
 954            talloc_free() below to free it. If any of the calls want
 955            to keep it, then they will steal it somewhere else, and the
 956            talloc_free() will be a no-op */
 957         tmp_ctx = talloc_new(client);
 958         talloc_steal(tmp_ctx, hdr);
 959
 960         if (hdr->ctdb_magic != CTDB_MAGIC) {
 961                 ctdb_set_error(client->ctdb, "Non CTDB packet rejected in daemon\n");
 962                 goto done;
 963         }
 964
 965         if (hdr->ctdb_version != CTDB_PROTOCOL) {
 966                 ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
 967                 goto done;
 968         }
 969
 970         switch (hdr->operation) {
 971         case CTDB_REQ_CALL:
 972                 CTDB_INCREMENT_STAT(ctdb, client.req_call);
 973                 daemon_request_call_from_client(client, (struct ctdb_req_call_old *)hdr);
 974                 break;
 975
 976         case CTDB_REQ_MESSAGE:
 977                 CTDB_INCREMENT_STAT(ctdb, client.req_message);
 978                 daemon_request_message_from_client(client, (struct ctdb_req_message_old *)hdr);
 979                 break;
 980
 981         case CTDB_REQ_CONTROL:
 982                 CTDB_INCREMENT_STAT(ctdb, client.req_control);
 983                 daemon_request_control_from_client(client, (struct ctdb_req_control_old *)hdr);
 984                 break;
 985
 986         case CTDB_REQ_TUNNEL:
 987                 CTDB_INCREMENT_STAT(ctdb, client.req_tunnel);
 988                 daemon_request_tunnel_from_client(client, (struct ctdb_req_tunnel_old *)hdr);
 989                 break;
 990
 991         default:
 992                 DEBUG(DEBUG_CRIT,(__location__ " daemon: unrecognized operation %u\n",
 993                          hdr->operation));
 994         }
 995
 996 done:
 997         talloc_free(tmp_ctx);
 998 }
 999
1000 /*
1001   called when the daemon gets a incoming packet
1002  */
1003 static void ctdb_daemon_read_cb(uint8_t *data, size_t cnt, void *args)
1004 {
1005         struct ctdb_client *client = talloc_get_type(args, struct ctdb_client);
1006         struct ctdb_req_header *hdr;
1007
1008         if (cnt == 0) {
1009                 talloc_free(client);
1010                 return;
1011         }
1012
1013         CTDB_INCREMENT_STAT(client->ctdb, client_packets_recv);
1014
1015         if (cnt < sizeof(*hdr)) {
1016                 ctdb_set_error(client->ctdb, "Bad packet length %u in daemon\n",
1017                                (unsigned)cnt);
1018                 return;
1019         }
1020         hdr = (struct ctdb_req_header *)data;
1021
1022         if (hdr->ctdb_magic != CTDB_MAGIC) {
1023                 ctdb_set_error(client->ctdb, "Non CTDB packet rejected\n");
1024                 goto err_out;
1025         }
1026
1027         if (hdr->ctdb_version != CTDB_PROTOCOL) {
1028                 ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
1029                 goto err_out;
1030         }
1031
1032         DEBUG(DEBUG_DEBUG,(__location__ " client request %u of type %u length %u from "
1033                  "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length,
1034                  hdr->srcnode, hdr->destnode));
1035
1036         /* it is the responsibility of the incoming packet function to free 'data' */
1037         daemon_incoming_packet(client, hdr);
1038         return;
1039
1040 err_out:
1041         TALLOC_FREE(data);
1042 }
1043
1044
1045 static int ctdb_clientpid_destructor(struct ctdb_client_pid_list *client_pid)
1046 {
1047         if (client_pid->ctdb->client_pids != NULL) {
1048                 DLIST_REMOVE(client_pid->ctdb->client_pids, client_pid);
1049         }
1050
1051         return 0;
1052 }
1053
1054 static int get_new_client_id(struct reqid_context *idr,
1055                              struct ctdb_client *client,
1056                              uint32_t *out)
1057 {
1058         uint32_t client_id;
1059
1060         client_id = reqid_new(idr, client);
1061         /*
1062          * Some places in the code (e.g. ctdb_control_db_attach(),
1063          * ctdb_control_db_detach()) assign a special meaning to
1064          * client_id 0.  The assumption is that if client_id is 0 then
1065          * the control has come from another daemon.  Therefore, we
1066          * should never return client_id == 0.
1067          */
1068         if (client_id == 0) {
1069                 /*
1070                  * Don't leak ID 0.  This is safe because the ID keeps
1071                  * increasing.  A test will be added to ensure that
1072                  * this doesn't change.
1073                  */
1074                 reqid_remove(idr, 0);
1075
1076                 client_id = reqid_new(idr, client);
1077         }
1078
1079         if (client_id == REQID_INVALID) {
1080                 return EINVAL;
1081         }
1082
1083         if (client_id == 0) {
1084                 /* Every other ID must have been used and we can't use 0 */
1085                 reqid_remove(idr, 0);
1086                 return EINVAL;
1087         }
1088
1089         *out = client_id;
1090         return 0;
1091 }
1092
1093 static void ctdb_accept_client(struct tevent_context *ev,
1094                                struct tevent_fd *fde, uint16_t flags,
1095                                void *private_data)
1096 {
1097         struct sockaddr_un addr;
1098         socklen_t len;
1099         int fd;
1100         struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
1101         struct ctdb_client *client;
1102         struct ctdb_client_pid_list *client_pid;
1103         pid_t peer_pid = 0;
1104         int ret;
1105
1106         memset(&addr, 0, sizeof(addr));
1107         len = sizeof(addr);
1108         fd = accept(ctdb->daemon.sd, (struct sockaddr *)&addr, &len);
1109         if (fd == -1) {
1110                 return;
1111         }
1112         smb_set_close_on_exec(fd);
1113
1114         ret = set_blocking(fd, false);
1115         if (ret != 0) {
1116                 DEBUG(DEBUG_ERR,
1117                       (__location__
1118                        " failed to set socket non-blocking (%s)\n",
1119                        strerror(errno)));
1120                 close(fd);
1121                 return;
1122         }
1123
1124         set_close_on_exec(fd);
1125
1126         DEBUG(DEBUG_DEBUG,(__location__ " Created SOCKET FD:%d to connected child\n", fd));
1127
1128         client = talloc_zero(ctdb, struct ctdb_client);
1129         if (ctdb_get_peer_pid(fd, &peer_pid) == 0) {
1130                 DEBUG(DEBUG_INFO,("Connected client with pid:%u\n", (unsigned)peer_pid));
1131         }
1132
1133         client->ctdb = ctdb;
1134         client->fd = fd;
1135
1136         ret = get_new_client_id(ctdb->idr, client, &client->client_id);
1137         if (ret != 0) {
1138                 DBG_ERR("Unable to get client ID (%d)\n", ret);
1139                 close(fd);
1140                 talloc_free(client);
1141                 return;
1142         }
1143
1144         client->pid = peer_pid;
1145
1146         client_pid = talloc(client, struct ctdb_client_pid_list);
1147         if (client_pid == NULL) {
1148                 DEBUG(DEBUG_ERR,("Failed to allocate client pid structure\n"));
1149                 close(fd);
1150                 talloc_free(client);
1151                 return;
1152         }
1153         client_pid->ctdb   = ctdb;
1154         client_pid->pid    = peer_pid;
1155         client_pid->client = client;
1156
1157         DLIST_ADD(ctdb->client_pids, client_pid);
1158
1159         client->queue = ctdb_queue_setup(ctdb, client, fd, CTDB_DS_ALIGNMENT,
1160                                          ctdb_daemon_read_cb, client,
1161                                          "client-%u", client->pid);
1162
1163         talloc_set_destructor(client, ctdb_client_destructor);
1164         talloc_set_destructor(client_pid, ctdb_clientpid_destructor);
1165         ctdb->num_clients++;
1166 }
1167
1168
1169
1170 /*
1171  * Create a unix domain socket, bind it, secure it and listen.  Return
1172  * the file descriptor for the socket.
1173  */
1174 static int ux_socket_bind(struct ctdb_context *ctdb, bool test_mode_enabled)
1175 {
1176         struct sockaddr_un addr = { .sun_family = AF_UNIX };
1177         int ret;
1178
1179         ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
1180         if (ctdb->daemon.sd == -1) {
1181                 return -1;
1182         }
1183
1184         strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)-1);
1185
1186         if (! sock_clean(ctdb->daemon.name)) {
1187                 return -1;
1188         }
1189
1190         set_close_on_exec(ctdb->daemon.sd);
1191
1192         ret = set_blocking(ctdb->daemon.sd, false);
1193         if (ret != 0) {
1194                 DBG_ERR("Failed to set socket non-blocking (%s)\n",
1195                         strerror(errno));
1196                 goto failed;
1197         }
1198
1199         ret = bind(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr));
1200         if (ret == -1) {
1201                 D_ERR("Unable to bind on ctdb socket '%s'\n", ctdb->daemon.name);
1202                 goto failed;
1203         }
1204
1205         if (!test_mode_enabled) {
1206                 ret = chown(ctdb->daemon.name, geteuid(), getegid());
1207                 if (ret != 0 && !test_mode_enabled) {
1208                         D_ERR("Unable to secure (chown) ctdb socket '%s'\n",
1209                               ctdb->daemon.name);
1210                         goto failed;
1211                 }
1212         }
1213
1214         ret = chmod(ctdb->daemon.name, 0700);
1215         if (ret != 0) {
1216                 D_ERR("Unable to secure (chmod) ctdb socket '%s'\n",
1217                       ctdb->daemon.name);
1218                 goto failed;
1219         }
1220
1221
1222         ret = listen(ctdb->daemon.sd, 100);
1223         if (ret != 0) {
1224                 D_ERR("Unable to listen on ctdb socket '%s'\n",
1225                       ctdb->daemon.name);
1226                 goto failed;
1227         }
1228
1229         D_NOTICE("Listening to ctdb socket %s\n", ctdb->daemon.name);
1230         return 0;
1231
1232 failed:
1233         close(ctdb->daemon.sd);
1234         ctdb->daemon.sd = -1;
1235         return -1;
1236 }
1237
1238 struct ctdb_node *ctdb_find_node(struct ctdb_context *ctdb, uint32_t pnn)
1239 {
1240         struct ctdb_node *node = NULL;
1241         unsigned int i;
1242
1243         if (pnn == CTDB_CURRENT_NODE) {
1244                 pnn = ctdb->pnn;
1245         }
1246
1247         /* Always found: PNN correctly set just before this is called */
1248         for (i = 0; i < ctdb->num_nodes; i++) {
1249                 node = ctdb->nodes[i];
1250                 if (pnn == node->pnn) {
1251                         return node;
1252                 }
1253         }
1254
1255         return NULL;
1256 }
1257
1258 static void initialise_node_flags (struct ctdb_context *ctdb)
1259 {
1260         struct ctdb_node *node = NULL;
1261
1262         node = ctdb_find_node(ctdb, CTDB_CURRENT_NODE);
1263         /*
1264          * PNN correctly set just before this is called so always
1265          * found but keep static analysers happy...
1266          */
1267         if (node == NULL) {
1268                 DBG_ERR("Unable to find current node\n");
1269                 return;
1270         }
1271
1272         node->flags &= ~NODE_FLAGS_DISCONNECTED;
1273
1274         /* do we start out in DISABLED mode? */
1275         if (ctdb->start_as_disabled != 0) {
1276                 D_ERR("This node is configured to start in DISABLED state\n");
1277                 node->flags |= NODE_FLAGS_PERMANENTLY_DISABLED;
1278         }
1279         /* do we start out in STOPPED mode? */
1280         if (ctdb->start_as_stopped != 0) {
1281                 D_ERR("This node is configured to start in STOPPED state\n");
1282                 node->flags |= NODE_FLAGS_STOPPED;
1283         }
1284 }
1285
1286 static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
1287                                       void *private_data)
1288 {
1289         if (status != 0) {
1290                 ctdb_die(ctdb, "Failed to run setup event");
1291         }
1292         ctdb_run_notification_script(ctdb, "setup");
1293
1294         /* Start the recovery daemon */
1295         if (ctdb_start_recoverd(ctdb) != 0) {
1296                 DEBUG(DEBUG_ALERT,("Failed to start recovery daemon\n"));
1297                 exit(11);
1298         }
1299
1300         ctdb_start_periodic_events(ctdb);
1301
1302         ctdb_wait_for_first_recovery(ctdb);
1303 }
1304
1305 static struct timeval tevent_before_wait_ts;
1306 static struct timeval tevent_after_wait_ts;
1307
1308 static void ctdb_tevent_trace_init(void)
1309 {
1310         struct timeval now;
1311
1312         now = timeval_current();
1313
1314         tevent_before_wait_ts = now;
1315         tevent_after_wait_ts = now;
1316 }
1317
1318 static void ctdb_tevent_trace(enum tevent_trace_point tp,
1319                               void *private_data)
1320 {
1321         struct timeval diff;
1322         struct timeval now;
1323         struct ctdb_context *ctdb =
1324                 talloc_get_type(private_data, struct ctdb_context);
1325
1326         if (getpid() != ctdb->ctdbd_pid) {
1327                 return;
1328         }
1329
1330         now = timeval_current();
1331
1332         switch (tp) {
1333         case TEVENT_TRACE_BEFORE_WAIT:
1334                 diff = timeval_until(&tevent_after_wait_ts, &now);
1335                 if (diff.tv_sec > 3) {
1336                         DEBUG(DEBUG_ERR,
1337                               ("Handling event took %ld seconds!\n",
1338                                (long)diff.tv_sec));
1339                 }
1340                 tevent_before_wait_ts = now;
1341                 break;
1342
1343         case TEVENT_TRACE_AFTER_WAIT:
1344                 diff = timeval_until(&tevent_before_wait_ts, &now);
1345                 if (diff.tv_sec > 3) {
1346                         DEBUG(DEBUG_ERR,
1347                               ("No event for %ld seconds!\n",
1348                                (long)diff.tv_sec));
1349                 }
1350                 tevent_after_wait_ts = now;
1351                 break;
1352
1353         default:
1354                 /* Do nothing for future tevent trace points */ ;
1355         }
1356 }
1357
1358 static void ctdb_remove_pidfile(void)
1359 {
1360         TALLOC_FREE(ctdbd_pidfile_ctx);
1361 }
1362
1363 static void ctdb_create_pidfile(TALLOC_CTX *mem_ctx)
1364 {
1365         if (ctdbd_pidfile != NULL) {
1366                 int ret = pidfile_context_create(mem_ctx, ctdbd_pidfile,
1367                                                  &ctdbd_pidfile_ctx);
1368                 if (ret != 0) {
1369                         DEBUG(DEBUG_ERR,
1370                               ("Failed to create PID file %s\n",
1371                                ctdbd_pidfile));
1372                         exit(11);
1373                 }
1374
1375                 DEBUG(DEBUG_NOTICE, ("Created PID file %s\n", ctdbd_pidfile));
1376                 atexit(ctdb_remove_pidfile);
1377         }
1378 }
1379
1380 static void ctdb_initialise_vnn_map(struct ctdb_context *ctdb)
1381 {
1382         unsigned int i, j, count;
1383
1384         /* initialize the vnn mapping table, skipping any deleted nodes */
1385         ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
1386         CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map);
1387
1388         count = 0;
1389         for (i = 0; i < ctdb->num_nodes; i++) {
1390                 if ((ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) == 0) {
1391                         count++;
1392                 }
1393         }
1394
1395         ctdb->vnn_map->generation = INVALID_GENERATION;
1396         ctdb->vnn_map->size = count;
1397         ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, ctdb->vnn_map->size);
1398         CTDB_NO_MEMORY_FATAL(ctdb, ctdb->vnn_map->map);
1399
1400         for(i=0, j=0; i < ctdb->vnn_map->size; i++) {
1401                 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1402                         continue;
1403                 }
1404                 ctdb->vnn_map->map[j] = i;
1405                 j++;
1406         }
1407 }
1408
1409 static void ctdb_set_my_pnn(struct ctdb_context *ctdb)
1410 {
1411         if (ctdb->address == NULL) {
1412                 ctdb_fatal(ctdb,
1413                            "Can not determine PNN - node address is not set\n");
1414         }
1415
1416         ctdb->pnn = ctdb_ip_to_pnn(ctdb, ctdb->address);
1417         if (ctdb->pnn == CTDB_UNKNOWN_PNN) {
1418                 ctdb_fatal(ctdb,
1419                            "Can not determine PNN - unknown node address\n");
1420         }
1421
1422         D_NOTICE("PNN is %u\n", ctdb->pnn);
1423 }
1424
1425 static void stdin_handler(struct tevent_context *ev,
1426                           struct tevent_fd *fde,
1427                           uint16_t flags,
1428                           void *private_data)
1429 {
1430         struct ctdb_context *ctdb = talloc_get_type_abort(
1431                 private_data, struct ctdb_context);
1432         ssize_t nread;
1433         char c;
1434
1435         nread = read(STDIN_FILENO, &c, 1);
1436         if (nread != 1) {
1437                 D_ERR("stdin closed, exiting\n");
1438                 talloc_free(fde);
1439                 ctdb_shutdown_sequence(ctdb, EPIPE);
1440         }
1441 }
1442
1443 static int setup_stdin_handler(struct ctdb_context *ctdb)
1444 {
1445         struct tevent_fd *fde;
1446         struct stat st;
1447         int ret;
1448
1449         ret = fstat(STDIN_FILENO, &st);
1450         if (ret != 0) {
1451                 /* Problem with stdin, ignore... */
1452                 DBG_INFO("Can't fstat() stdin\n");
1453                 return 0;
1454         }
1455
1456         if (!S_ISFIFO(st.st_mode)) {
1457                 DBG_INFO("Not a pipe...\n");
1458                 return 0;
1459         }
1460
1461         fde = tevent_add_fd(ctdb->ev,
1462                             ctdb,
1463                             STDIN_FILENO,
1464                             TEVENT_FD_READ,
1465                             stdin_handler,
1466                             ctdb);
1467         if (fde == NULL) {
1468                 return ENOMEM;
1469         }
1470
1471         DBG_INFO("Set up stdin handler\n");
1472         return 0;
1473 }
1474
/*
 * Fork and carry on in the child only.
 *
 * The parent exits with status 0.  If fork() fails the error is logged
 * and the process exits with status 1.
 */
static void fork_only(void)
{
        pid_t child = fork();

        switch (child) {
        case -1:
                D_ERR("Fork failed (errno=%d)\n", errno);
                exit(1);

        case 0:
                /* Child: return to the caller and keep running */
                return;

        default:
                /* Parent simply exits... */
                exit(0);
        }
}
1490
1491 static void sighup_hook(void *private_data)
1492 {
1493         struct ctdb_context *ctdb = talloc_get_type_abort(private_data,
1494                                                           struct ctdb_context);
1495
1496         if (ctdb->recoverd_pid > 0) {
1497                 kill(ctdb->recoverd_pid, SIGHUP);
1498         }
1499         ctdb_event_reopen_logs(ctdb);
1500 }
1501
1502 /*
1503   start the protocol going as a daemon
1504 */
1505 int ctdb_start_daemon(struct ctdb_context *ctdb,
1506                       bool interactive,
1507                       bool test_mode_enabled)
1508 {
1509         bool status;
1510         int ret;
1511         struct tevent_fd *fde;
1512
1513         /* Fork if not interactive */
1514         if (!interactive) {
1515                 if (test_mode_enabled) {
1516                         /* Keep stdin open */
1517                         fork_only();
1518                 } else {
1519                         /* Fork, close stdin, start a session */
1520                         become_daemon(true, false, false);
1521                 }
1522         }
1523
1524         ignore_signal(SIGPIPE);
1525         ignore_signal(SIGUSR1);
1526
1527         ctdb->ctdbd_pid = getpid();
1528         DEBUG(DEBUG_ERR, ("Starting CTDBD (Version %s) as PID: %u\n",
1529                           SAMBA_VERSION_STRING, ctdb->ctdbd_pid));
1530         ctdb_create_pidfile(ctdb);
1531
1532         /* create a unix domain stream socket to listen to */
1533         ret = ux_socket_bind(ctdb, test_mode_enabled);
1534         if (ret != 0) {
1535                 D_ERR("Cannot continue.  Exiting!\n");
1536                 exit(10);
1537         }
1538
1539         /* Make sure we log something when the daemon terminates.
1540          * This must be the first exit handler to run (so the last to
1541          * be registered.
1542          */
1543         __ctdbd_pid = getpid();
1544         atexit(print_exit_message);
1545
1546         if (ctdb->do_setsched) {
1547                 /* try to set us up as realtime */
1548                 if (!set_scheduler()) {
1549                         exit(1);
1550                 }
1551                 DEBUG(DEBUG_NOTICE, ("Set real-time scheduler priority\n"));
1552         }
1553
1554         ctdb->ev = tevent_context_init(NULL);
1555         if (ctdb->ev == NULL) {
1556                 DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n"));
1557                 exit(1);
1558         }
1559         tevent_loop_allow_nesting(ctdb->ev);
1560         ctdb_tevent_trace_init();
1561         tevent_set_trace_callback(ctdb->ev, ctdb_tevent_trace, ctdb);
1562
1563         status = logging_setup_sighup_handler(ctdb->ev,
1564                                               ctdb,
1565                                               sighup_hook,
1566                                               ctdb);
1567         if (!status) {
1568                 D_ERR("Failed to set up signal handler for SIGHUP\n");
1569                 exit(1);
1570         }
1571
1572         /* set up a handler to pick up sigchld */
1573         if (ctdb_init_sigchld(ctdb) == NULL) {
1574                 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD\n"));
1575                 exit(1);
1576         }
1577
1578         if (!interactive) {
1579                 ctdb_set_child_logging(ctdb);
1580         }
1581
1582         /* Exit if stdin is closed */
1583         if (test_mode_enabled) {
1584                 ret = setup_stdin_handler(ctdb);
1585                 if (ret != 0) {
1586                         DBG_ERR("Failed to setup stdin handler\n");
1587                         exit(1);
1588                 }
1589         }
1590
1591         TALLOC_FREE(ctdb->srv);
1592         if (srvid_init(ctdb, &ctdb->srv) != 0) {
1593                 DEBUG(DEBUG_CRIT,("Failed to setup message srvid context\n"));
1594                 exit(1);
1595         }
1596
1597         TALLOC_FREE(ctdb->tunnels);
1598         if (srvid_init(ctdb, &ctdb->tunnels) != 0) {
1599                 DEBUG(DEBUG_ERR, ("Failed to setup tunnels context\n"));
1600                 exit(1);
1601         }
1602
1603         /* initialize statistics collection */
1604         ctdb_statistics_init(ctdb);
1605
1606         /* force initial recovery for election */
1607         ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
1608
1609         if (ctdb_start_eventd(ctdb) != 0) {
1610                 DEBUG(DEBUG_ERR, ("Failed to start event daemon\n"));
1611                 exit(1);
1612         }
1613
1614         ctdb_set_runstate(ctdb, CTDB_RUNSTATE_INIT);
1615         ret = ctdb_event_script(ctdb, CTDB_EVENT_INIT);
1616         if (ret != 0) {
1617                 ctdb_die(ctdb, "Failed to run init event\n");
1618         }
1619         ctdb_run_notification_script(ctdb, "init");
1620
1621         if (strcmp(ctdb->transport, "tcp") == 0) {
1622                 ret = ctdb_tcp_init(ctdb);
1623         }
1624 #ifdef USE_INFINIBAND
1625         if (strcmp(ctdb->transport, "ib") == 0) {
1626                 ret = ctdb_ibw_init(ctdb);
1627         }
1628 #endif
1629         if (ret != 0) {
1630                 DEBUG(DEBUG_ERR,("Failed to initialise transport '%s'\n", ctdb->transport));
1631                 return -1;
1632         }
1633
1634         if (ctdb->methods == NULL) {
1635                 DEBUG(DEBUG_ALERT,(__location__ " Can not initialize transport. ctdb->methods is NULL\n"));
1636                 ctdb_fatal(ctdb, "transport is unavailable. can not initialize.");
1637         }
1638
1639         /* Initialise the transport.  This sets the node address if it
1640          * was not set via the command-line. */
1641         if (ctdb->methods->initialise(ctdb) != 0) {
1642                 ctdb_fatal(ctdb, "transport failed to initialise");
1643         }
1644
1645         ctdb_set_my_pnn(ctdb);
1646
1647         initialise_node_flags(ctdb);
1648
1649         ret = ctdb_set_public_addresses(ctdb, true);
1650         if (ret == -1) {
1651                 D_ERR("Unable to setup public IP addresses\n");
1652                 exit(1);
1653         }
1654
1655         ctdb_initialise_vnn_map(ctdb);
1656
1657         /* attach to existing databases */
1658         if (ctdb_attach_databases(ctdb) != 0) {
1659                 ctdb_fatal(ctdb, "Failed to attach to databases\n");
1660         }
1661
1662         /* start frozen, then let the first election sort things out */
1663         if (!ctdb_blocking_freeze(ctdb)) {
1664                 ctdb_fatal(ctdb, "Failed to get initial freeze\n");
1665         }
1666
1667         /* now start accepting clients, only can do this once frozen */
1668         fde = tevent_add_fd(ctdb->ev, ctdb, ctdb->daemon.sd, TEVENT_FD_READ,
1669                             ctdb_accept_client, ctdb);
1670         if (fde == NULL) {
1671                 ctdb_fatal(ctdb, "Failed to add daemon socket to event loop");
1672         }
1673         tevent_fd_set_auto_close(fde);
1674
1675         /* Start the transport */
1676         if (ctdb->methods->start(ctdb) != 0) {
1677                 DEBUG(DEBUG_ALERT,("transport failed to start!\n"));
1678                 ctdb_fatal(ctdb, "transport failed to start");
1679         }
1680
1681         /* Recovery daemon and timed events are started from the
1682          * callback, only after the setup event completes
1683          * successfully.
1684          */
1685         ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SETUP);
1686         ret = ctdb_event_script_callback(ctdb,
1687                                          ctdb,
1688                                          ctdb_setup_event_callback,
1689                                          ctdb,
1690                                          CTDB_EVENT_SETUP,
1691                                          "%s",
1692                                          "");
1693         if (ret != 0) {
1694                 DEBUG(DEBUG_CRIT,("Failed to set up 'setup' event\n"));
1695                 exit(1);
1696         }
1697
1698         lockdown_memory(ctdb->valgrinding);
1699
1700         /* go into a wait loop to allow other nodes to complete */
1701         tevent_loop_wait(ctdb->ev);
1702
1703         DEBUG(DEBUG_CRIT,("event_loop_wait() returned. this should not happen\n"));
1704         exit(1);
1705 }
1706
1707 /*
1708   allocate a packet for use in daemon<->daemon communication
1709  */
1710 struct ctdb_req_header *_ctdb_transport_allocate(struct ctdb_context *ctdb,
1711                                                  TALLOC_CTX *mem_ctx,
1712                                                  enum ctdb_operation operation,
1713                                                  size_t length, size_t slength,
1714                                                  const char *type)
1715 {
1716         int size;
1717         struct ctdb_req_header *hdr;
1718
1719         length = MAX(length, slength);
1720         size = (length+(CTDB_DS_ALIGNMENT-1)) & ~(CTDB_DS_ALIGNMENT-1);
1721
1722         if (ctdb->methods == NULL) {
1723                 DEBUG(DEBUG_INFO,(__location__ " Unable to allocate transport packet for operation %u of length %u. Transport is DOWN.\n",
1724                          operation, (unsigned)length));
1725                 return NULL;
1726         }
1727
1728         hdr = (struct ctdb_req_header *)ctdb->methods->allocate_pkt(mem_ctx, size);
1729         if (hdr == NULL) {
1730                 DEBUG(DEBUG_ERR,("Unable to allocate transport packet for operation %u of length %u\n",
1731                          operation, (unsigned)length));
1732                 return NULL;
1733         }
1734         talloc_set_name_const(hdr, type);
1735         memset(hdr, 0, slength);
1736         hdr->length       = length;
1737         hdr->operation    = operation;
1738         hdr->ctdb_magic   = CTDB_MAGIC;
1739         hdr->ctdb_version = CTDB_PROTOCOL;
1740         hdr->generation   = ctdb->vnn_map->generation;
1741         hdr->srcnode      = ctdb->pnn;
1742
1743         return hdr;
1744 }
1745
/*
  state kept for one control request issued by a local client while
  the daemon waits for the reply
 */
struct daemon_control_state {
        /* linkage for the destination node's pending_controls list */
        struct daemon_control_state *next;
        struct daemon_control_state *prev;

        /* the client that issued the control; the reply is queued to it */
        struct ctdb_client *client;

        /* the request packet, kept as a talloc child of this state */
        struct ctdb_req_control_old *c;

        /* request id copied from the request header and echoed in the reply */
        uint32_t reqid;

        /* destination node, or NULL if the destination pnn did not validate */
        struct ctdb_node *node;
};
1753
1754 /*
1755   callback when a control reply comes in
1756  */
1757 static void daemon_control_callback(struct ctdb_context *ctdb,
1758                                     int32_t status, TDB_DATA data,
1759                                     const char *errormsg,
1760                                     void *private_data)
1761 {
1762         struct daemon_control_state *state = talloc_get_type(private_data,
1763                                                              struct daemon_control_state);
1764         struct ctdb_client *client = state->client;
1765         struct ctdb_reply_control_old *r;
1766         size_t len;
1767         int ret;
1768
1769         /* construct a message to send to the client containing the data */
1770         len = offsetof(struct ctdb_reply_control_old, data) + data.dsize;
1771         if (errormsg) {
1772                 len += strlen(errormsg);
1773         }
1774         r = ctdbd_allocate_pkt(ctdb, state, CTDB_REPLY_CONTROL, len,
1775                                struct ctdb_reply_control_old);
1776         CTDB_NO_MEMORY_VOID(ctdb, r);
1777
1778         r->hdr.reqid     = state->reqid;
1779         r->status        = status;
1780         r->datalen       = data.dsize;
1781         r->errorlen = 0;
1782         memcpy(&r->data[0], data.dptr, data.dsize);
1783         if (errormsg) {
1784                 r->errorlen = strlen(errormsg);
1785                 memcpy(&r->data[r->datalen], errormsg, r->errorlen);
1786         }
1787
1788         ret = daemon_queue_send(client, &r->hdr);
1789         if (ret != -1) {
1790                 talloc_free(state);
1791         }
1792 }
1793
1794 /*
1795   fail all pending controls to a disconnected node
1796  */
1797 void ctdb_daemon_cancel_controls(struct ctdb_context *ctdb, struct ctdb_node *node)
1798 {
1799         struct daemon_control_state *state;
1800         while ((state = node->pending_controls)) {
1801                 DLIST_REMOVE(node->pending_controls, state);
1802                 daemon_control_callback(ctdb, (uint32_t)-1, tdb_null,
1803                                         "node is disconnected", state);
1804         }
1805 }
1806
1807 /*
1808   destroy a daemon_control_state
1809  */
1810 static int daemon_control_destructor(struct daemon_control_state *state)
1811 {
1812         if (state->node) {
1813                 DLIST_REMOVE(state->node->pending_controls, state);
1814         }
1815         return 0;
1816 }
1817
1818 /*
1819   this is called when the ctdb daemon received a ctdb request control
1820   from a local client over the unix domain socket
1821  */
1822 static void daemon_request_control_from_client(struct ctdb_client *client,
1823                                                struct ctdb_req_control_old *c)
1824 {
1825         TDB_DATA data;
1826         int res;
1827         struct daemon_control_state *state;
1828         TALLOC_CTX *tmp_ctx = talloc_new(client);
1829
1830         if (c->hdr.destnode == CTDB_CURRENT_NODE) {
1831                 c->hdr.destnode = client->ctdb->pnn;
1832         }
1833
1834         state = talloc(client, struct daemon_control_state);
1835         CTDB_NO_MEMORY_VOID(client->ctdb, state);
1836
1837         state->client = client;
1838         state->c = talloc_steal(state, c);
1839         state->reqid = c->hdr.reqid;
1840         if (ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
1841                 state->node = client->ctdb->nodes[c->hdr.destnode];
1842                 DLIST_ADD(state->node->pending_controls, state);
1843         } else {
1844                 state->node = NULL;
1845         }
1846
1847         talloc_set_destructor(state, daemon_control_destructor);
1848
1849         if (c->flags & CTDB_CTRL_FLAG_NOREPLY) {
1850                 talloc_steal(tmp_ctx, state);
1851         }
1852
1853         data.dptr = &c->data[0];
1854         data.dsize = c->datalen;
1855         res = ctdb_daemon_send_control(client->ctdb, c->hdr.destnode,
1856                                        c->srvid, c->opcode, client->client_id,
1857                                        c->flags,
1858                                        data, daemon_control_callback,
1859                                        state);
1860         if (res != 0) {
1861                 DEBUG(DEBUG_ERR,(__location__ " Failed to send control to remote node %u\n",
1862                          c->hdr.destnode));
1863         }
1864
1865         talloc_free(tmp_ctx);
1866 }
1867
1868 static void daemon_request_tunnel_from_client(struct ctdb_client *client,
1869                                               struct ctdb_req_tunnel_old *c)
1870 {
1871         TDB_DATA data;
1872         int ret;
1873
1874         if (! ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
1875                 DEBUG(DEBUG_ERR, ("Invalid destination 0x%x\n",
1876                                   c->hdr.destnode));
1877                 return;
1878         }
1879
1880         ret = srvid_exists(client->ctdb->tunnels, c->tunnel_id, NULL);
1881         if (ret != 0) {
1882                 DEBUG(DEBUG_ERR,
1883                       ("tunnel id 0x%"PRIx64" not registered, dropping pkt\n",
1884                        c->tunnel_id));
1885                 return;
1886         }
1887
1888         data = (TDB_DATA) {
1889                 .dsize = c->datalen,
1890                 .dptr = &c->data[0],
1891         };
1892
1893         ret = ctdb_daemon_send_tunnel(client->ctdb, c->hdr.destnode,
1894                                       c->tunnel_id, c->flags, data);
1895         if (ret != 0) {
1896                 DEBUG(DEBUG_ERR, ("Failed to set tunnel to remote note %u\n",
1897                                   c->hdr.destnode));
1898         }
1899 }
1900
1901 /*
1902   register a call function
1903 */
1904 int ctdb_daemon_set_call(struct ctdb_context *ctdb, uint32_t db_id,
1905                          ctdb_fn_t fn, int id)
1906 {
1907         struct ctdb_registered_call *call;
1908         struct ctdb_db_context *ctdb_db;
1909
1910         ctdb_db = find_ctdb_db(ctdb, db_id);
1911         if (ctdb_db == NULL) {
1912                 return -1;
1913         }
1914
1915         call = talloc(ctdb_db, struct ctdb_registered_call);
1916         call->fn = fn;
1917         call->id = id;
1918
1919         DLIST_ADD(ctdb_db->calls, call);
1920         return 0;
1921 }
1922
1923
1924
1925 /*
1926   this local messaging handler is ugly, but is needed to prevent
1927   recursion in ctdb_send_message() when the destination node is the
1928   same as the source node
1929  */
1930 struct ctdb_local_message {
1931         struct ctdb_context *ctdb;
1932         uint64_t srvid;
1933         TDB_DATA data;
1934 };
1935
1936 static void ctdb_local_message_trigger(struct tevent_context *ev,
1937                                        struct tevent_timer *te,
1938                                        struct timeval t, void *private_data)
1939 {
1940         struct ctdb_local_message *m = talloc_get_type(
1941                 private_data, struct ctdb_local_message);
1942
1943         srvid_dispatch(m->ctdb->srv, m->srvid, CTDB_SRVID_ALL, m->data);
1944         talloc_free(m);
1945 }
1946
1947 static int ctdb_local_message(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data)
1948 {
1949         struct ctdb_local_message *m;
1950         m = talloc(ctdb, struct ctdb_local_message);
1951         CTDB_NO_MEMORY(ctdb, m);
1952
1953         m->ctdb = ctdb;
1954         m->srvid = srvid;
1955         m->data  = data;
1956         m->data.dptr = talloc_memdup(m, m->data.dptr, m->data.dsize);
1957         if (m->data.dptr == NULL) {
1958                 talloc_free(m);
1959                 return -1;
1960         }
1961
1962         /* this needs to be done as an event to prevent recursion */
1963         tevent_add_timer(ctdb->ev, m, timeval_zero(),
1964                          ctdb_local_message_trigger, m);
1965         return 0;
1966 }
1967
1968 /*
1969   send a ctdb message
1970 */
1971 int ctdb_daemon_send_message(struct ctdb_context *ctdb, uint32_t pnn,
1972                              uint64_t srvid, TDB_DATA data)
1973 {
1974         struct ctdb_req_message_old *r;
1975         int len;
1976
1977         if (ctdb->methods == NULL) {
1978                 DEBUG(DEBUG_INFO,(__location__ " Failed to send message. Transport is DOWN\n"));
1979                 return -1;
1980         }
1981
1982         /* see if this is a message to ourselves */
1983         if (pnn == ctdb->pnn) {
1984                 return ctdb_local_message(ctdb, srvid, data);
1985         }
1986
1987         len = offsetof(struct ctdb_req_message_old, data) + data.dsize;
1988         r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_MESSAGE, len,
1989                                     struct ctdb_req_message_old);
1990         CTDB_NO_MEMORY(ctdb, r);
1991
1992         r->hdr.destnode  = pnn;
1993         r->srvid         = srvid;
1994         r->datalen       = data.dsize;
1995         memcpy(&r->data[0], data.dptr, data.dsize);
1996
1997         ctdb_queue_packet(ctdb, &r->hdr);
1998
1999         talloc_free(r);
2000         return 0;
2001 }
2002
2003
2004
2005 struct ctdb_client_notify_list {
2006         struct ctdb_client_notify_list *next, *prev;
2007         struct ctdb_context *ctdb;
2008         uint64_t srvid;
2009         TDB_DATA data;
2010 };
2011
2012
2013 static int ctdb_client_notify_destructor(struct ctdb_client_notify_list *nl)
2014 {
2015         int ret;
2016
2017         DEBUG(DEBUG_ERR,("Sending client notify message for srvid:%llu\n", (unsigned long long)nl->srvid));
2018
2019         ret = ctdb_daemon_send_message(nl->ctdb, CTDB_BROADCAST_CONNECTED, (unsigned long long)nl->srvid, nl->data);
2020         if (ret != 0) {
2021                 DEBUG(DEBUG_ERR,("Failed to send client notify message\n"));
2022         }
2023
2024         return 0;
2025 }
2026
2027 int32_t ctdb_control_register_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
2028 {
2029         struct ctdb_notify_data_old *notify = (struct ctdb_notify_data_old *)indata.dptr;
2030         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2031         struct ctdb_client_notify_list *nl;
2032
2033         DEBUG(DEBUG_INFO,("Register srvid %llu for client %d\n", (unsigned long long)notify->srvid, client_id));
2034
2035         if (indata.dsize < offsetof(struct ctdb_notify_data_old, notify_data)) {
2036                 DEBUG(DEBUG_ERR,(__location__ " Too little data in control : %d\n", (int)indata.dsize));
2037                 return -1;
2038         }
2039
2040         if (indata.dsize != (notify->len + offsetof(struct ctdb_notify_data_old, notify_data))) {
2041                 DEBUG(DEBUG_ERR,(__location__ " Wrong amount of data in control. Got %d, expected %d\n", (int)indata.dsize, (int)(notify->len + offsetof(struct ctdb_notify_data_old, notify_data))));
2042                 return -1;
2043         }
2044
2045
2046         if (client == NULL) {
2047                 DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
2048                 return -1;
2049         }
2050
2051         for(nl=client->notify; nl; nl=nl->next) {
2052                 if (nl->srvid == notify->srvid) {
2053                         break;
2054                 }
2055         }
2056         if (nl != NULL) {
2057                 DEBUG(DEBUG_ERR,(__location__ " Notification for srvid:%llu already exists for this client\n", (unsigned long long)notify->srvid));
2058                 return -1;
2059         }
2060
2061         nl = talloc(client, struct ctdb_client_notify_list);
2062         CTDB_NO_MEMORY(ctdb, nl);
2063         nl->ctdb       = ctdb;
2064         nl->srvid      = notify->srvid;
2065         nl->data.dsize = notify->len;
2066         nl->data.dptr  = talloc_memdup(nl, notify->notify_data,
2067                                        nl->data.dsize);
2068         CTDB_NO_MEMORY(ctdb, nl->data.dptr);
2069
2070         DLIST_ADD(client->notify, nl);
2071         talloc_set_destructor(nl, ctdb_client_notify_destructor);
2072
2073         return 0;
2074 }
2075
2076 int32_t ctdb_control_deregister_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
2077 {
2078         uint64_t srvid = *(uint64_t *)indata.dptr;
2079         struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2080         struct ctdb_client_notify_list *nl;
2081
2082         DEBUG(DEBUG_INFO,("Deregister srvid %llu for client %d\n", (unsigned long long)srvid, client_id));
2083
2084         if (client == NULL) {
2085                 DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
2086                 return -1;
2087         }
2088
2089         for(nl=client->notify; nl; nl=nl->next) {
2090                 if (nl->srvid == srvid) {
2091                         break;
2092                 }
2093         }
2094         if (nl == NULL) {
2095                 DEBUG(DEBUG_ERR,(__location__ " No notification for srvid:%llu found for this client\n", (unsigned long long)srvid));
2096                 return -1;
2097         }
2098
2099         DLIST_REMOVE(client->notify, nl);
2100         talloc_set_destructor(nl, NULL);
2101         talloc_free(nl);
2102
2103         return 0;
2104 }
2105
2106 struct ctdb_client *ctdb_find_client_by_pid(struct ctdb_context *ctdb, pid_t pid)
2107 {
2108         struct ctdb_client_pid_list *client_pid;
2109
2110         for (client_pid = ctdb->client_pids; client_pid; client_pid=client_pid->next) {
2111                 if (client_pid->pid == pid) {
2112                         return client_pid->client;
2113                 }
2114         }
2115         return NULL;
2116 }
2117
2118
2119 /* This control is used by samba when probing if a process (of a samba daemon)
2120    exists on the node.
2121    Samba does this when it needs/wants to check if a subrecord in one of the
2122    databases is still valid, or if it is stale and can be removed.
2123    If the node is in unhealthy or stopped state we just kill of the samba
2124    process holding this sub-record and return to the calling samba that
2125    the process does not exist.
2126    This allows us to forcefully recall subrecords registered by samba processes
2127    on banned and stopped nodes.
2128 */
2129 int32_t ctdb_control_process_exists(struct ctdb_context *ctdb, pid_t pid)
2130 {
2131         struct ctdb_client *client;
2132
2133         client = ctdb_find_client_by_pid(ctdb, pid);
2134         if (client == NULL) {
2135                 return -1;
2136         }
2137
2138         if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_INACTIVE) {
2139                 DEBUG(DEBUG_NOTICE,
2140                       ("Killing client with pid:%d on banned/stopped node\n",
2141                        (int)pid));
2142                 talloc_free(client);
2143                 return -1;
2144         }
2145
2146         return kill(pid, 0);
2147 }
2148
2149 int32_t ctdb_control_check_pid_srvid(struct ctdb_context *ctdb,
2150                                      TDB_DATA indata)
2151 {
2152         struct ctdb_client_pid_list *client_pid;
2153         pid_t pid;
2154         uint64_t srvid;
2155         int ret;
2156
2157         pid = *(pid_t *)indata.dptr;
2158         srvid = *(uint64_t *)(indata.dptr + sizeof(pid_t));
2159
2160         for (client_pid = ctdb->client_pids;
2161              client_pid != NULL;
2162              client_pid = client_pid->next) {
2163                 if (client_pid->pid == pid) {
2164                         ret = srvid_exists(ctdb->srv, srvid,
2165                                            client_pid->client);
2166                         if (ret == 0) {
2167                                 return 0;
2168                         }
2169                 }
2170         }
2171
2172         return -1;
2173 }
2174
2175 int ctdb_control_getnodesfile(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
2176 {
2177         struct ctdb_node_map_old *node_map = NULL;
2178
2179         CHECK_CONTROL_DATA_SIZE(0);
2180
2181         node_map = ctdb_read_nodes_file(ctdb, ctdb->nodes_file);
2182         if (node_map == NULL) {
2183                 DEBUG(DEBUG_ERR, ("Failed to read nodes file\n"));
2184                 return -1;
2185         }
2186
2187         outdata->dptr  = (unsigned char *)node_map;
2188         outdata->dsize = talloc_get_size(outdata->dptr);
2189
2190         return 0;
2191 }
2192
2193 void ctdb_shutdown_sequence(struct ctdb_context *ctdb, int exit_code)
2194 {
2195         if (ctdb->runstate == CTDB_RUNSTATE_SHUTDOWN) {
2196                 DEBUG(DEBUG_NOTICE,("Already shutting down so will not proceed.\n"));
2197                 return;
2198         }
2199
2200         DEBUG(DEBUG_ERR,("Shutdown sequence commencing.\n"));
2201         ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SHUTDOWN);
2202         ctdb_stop_recoverd(ctdb);
2203         ctdb_stop_keepalive(ctdb);
2204         ctdb_stop_monitoring(ctdb);
2205         ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
2206         ctdb_stop_eventd(ctdb);
2207         if (ctdb->methods != NULL && ctdb->methods->shutdown != NULL) {
2208                 ctdb->methods->shutdown(ctdb);
2209         }
2210
2211         DEBUG(DEBUG_ERR,("Shutdown sequence complete, exiting.\n"));
2212         exit(exit_code);
2213 }
2214
2215 /* When forking the main daemon and the child process needs to connect
2216  * back to the daemon as a client process, this function can be used
2217  * to change the ctdb context from daemon into client mode.  The child
2218  * process must be created using ctdb_fork() and not fork() -
2219  * ctdb_fork() does some necessary housekeeping.
2220  */
2221 int switch_from_server_to_client(struct ctdb_context *ctdb)
2222 {
2223         int ret;
2224
2225         if (ctdb->daemon.sd != -1) {
2226                 close(ctdb->daemon.sd);
2227                 ctdb->daemon.sd = -1;
2228         }
2229
2230         /* get a new event context */
2231         ctdb->ev = tevent_context_init(ctdb);
2232         if (ctdb->ev == NULL) {
2233                 DEBUG(DEBUG_ALERT,("tevent_context_init() failed\n"));
2234                 exit(1);
2235         }
2236         tevent_loop_allow_nesting(ctdb->ev);
2237
2238         /* Connect to main CTDB daemon */
2239         ret = ctdb_socket_connect(ctdb);
2240         if (ret != 0) {
2241                 DEBUG(DEBUG_ALERT, (__location__ " Failed to init ctdb client\n"));
2242                 return -1;
2243         }
2244
2245         ctdb->can_send_controls = true;
2246
2247         return 0;
2248 }