param: allow dump_globals to check the actual defaults
[Samba.git] / ctdb / server / ctdb_daemon.c
blobda2f42e9a97db44e9b740acb7fd027224987cfc7
/*
   ctdb daemon code

   Copyright (C) Andrew Tridgell  2006

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/

20 #include "includes.h"
21 #include "db_wrap.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_version.h"
28 #include "../include/ctdb_client.h"
29 #include "../include/ctdb_private.h"
30 #include "../common/rb_tree.h"
31 #include <sys/socket.h>
/*
  list element tying the process id of a connected client to its
  ctdb_client structure
 */
struct ctdb_client_pid_list {
	/* doubly linked list linkage (used with the DLIST_* macros) */
	struct ctdb_client_pid_list *next, *prev;
	struct ctdb_context *ctdb;
	pid_t pid;
	struct ctdb_client *client;
};

/*
  path of the PID file; ctdb_create_pidfile() and ctdb_remove_pidfile()
  do nothing while this is NULL
 */
const char *ctdbd_pidfile = NULL;

/* defined further down; needed by the packet wrapper and deferral code */
static void daemon_incoming_packet(void *, struct ctdb_req_header *);

44 static void print_exit_message(void)
46 if (debug_extra != NULL && debug_extra[0] != '\0') {
47 DEBUG(DEBUG_NOTICE,("CTDB %s shutting down\n", debug_extra));
48 } else {
49 DEBUG(DEBUG_NOTICE,("CTDB daemon shutting down\n"));
51 /* Wait a second to allow pending log messages to be flushed */
52 sleep(1);
58 static void ctdb_time_tick(struct event_context *ev, struct timed_event *te,
59 struct timeval t, void *private_data)
61 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
63 if (getpid() != ctdb->ctdbd_pid) {
64 return;
67 event_add_timed(ctdb->ev, ctdb,
68 timeval_current_ofs(1, 0),
69 ctdb_time_tick, ctdb);
72 /* Used to trigger a dummy event once per second, to make
73 * detection of hangs more reliable.
75 static void ctdb_start_time_tickd(struct ctdb_context *ctdb)
77 event_add_timed(ctdb->ev, ctdb,
78 timeval_current_ofs(1, 0),
79 ctdb_time_tick, ctdb);
/*
  kick off the recurring background activities of the daemon
 */
static void ctdb_start_periodic_events(struct ctdb_context *ctdb)
{
	/* monitoring for connected/disconnected nodes */
	ctdb_start_keepalive(ctdb);

	/* periodic update of tcp tickle lists */
	ctdb_start_tcp_tickle_update(ctdb);

	/* listening for recovery daemon pings */
	ctdb_control_recd_ping(ctdb);

	/* once-per-second timer ticks */
	ctdb_start_time_tickd(ctdb);
}

97 static void ignore_signal(int signum)
99 struct sigaction act;
101 memset(&act, 0, sizeof(act));
103 act.sa_handler = SIG_IGN;
104 sigemptyset(&act.sa_mask);
105 sigaddset(&act.sa_mask, signum);
106 sigaction(signum, &act, NULL);
111 send a packet to a client
113 static int daemon_queue_send(struct ctdb_client *client, struct ctdb_req_header *hdr)
115 CTDB_INCREMENT_STAT(client->ctdb, client_packets_sent);
116 if (hdr->operation == CTDB_REQ_MESSAGE) {
117 if (ctdb_queue_length(client->queue) > client->ctdb->tunable.max_queue_depth_drop_msg) {
118 DEBUG(DEBUG_ERR,("CTDB_REQ_MESSAGE queue full - killing client connection.\n"));
119 talloc_free(client);
120 return -1;
123 return ctdb_queue_send(client->queue, (uint8_t *)hdr, hdr->length);
127 message handler for when we are in daemon mode. This redirects the message
128 to the right client
130 static void daemon_message_handler(struct ctdb_context *ctdb, uint64_t srvid,
131 TDB_DATA data, void *private_data)
133 struct ctdb_client *client = talloc_get_type(private_data, struct ctdb_client);
134 struct ctdb_req_message *r;
135 int len;
137 /* construct a message to send to the client containing the data */
138 len = offsetof(struct ctdb_req_message, data) + data.dsize;
139 r = ctdbd_allocate_pkt(ctdb, ctdb, CTDB_REQ_MESSAGE,
140 len, struct ctdb_req_message);
141 CTDB_NO_MEMORY_VOID(ctdb, r);
143 talloc_set_name_const(r, "req_message packet");
145 r->srvid = srvid;
146 r->datalen = data.dsize;
147 memcpy(&r->data[0], data.dptr, data.dsize);
149 daemon_queue_send(client, &r->hdr);
151 talloc_free(r);
155 this is called when the ctdb daemon received a ctdb request to
156 set the srvid from the client
158 int daemon_register_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
160 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
161 int res;
162 if (client == NULL) {
163 DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_register_message_handler\n"));
164 return -1;
166 res = ctdb_register_message_handler(ctdb, client, srvid, daemon_message_handler, client);
167 if (res != 0) {
168 DEBUG(DEBUG_ERR,(__location__ " Failed to register handler %llu in daemon\n",
169 (unsigned long long)srvid));
170 } else {
171 DEBUG(DEBUG_INFO,(__location__ " Registered message handler for srvid=%llu\n",
172 (unsigned long long)srvid));
175 return res;
179 this is called when the ctdb daemon received a ctdb request to
180 remove a srvid from the client
182 int daemon_deregister_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
184 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
185 if (client == NULL) {
186 DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_deregister_message_handler\n"));
187 return -1;
189 return ctdb_deregister_message_handler(ctdb, srvid, client);
192 int daemon_check_srvids(struct ctdb_context *ctdb, TDB_DATA indata,
193 TDB_DATA *outdata)
195 uint64_t *ids;
196 int i, num_ids;
197 uint8_t *results;
199 if ((indata.dsize % sizeof(uint64_t)) != 0) {
200 DEBUG(DEBUG_ERR, ("Bad indata in daemon_check_srvids, "
201 "size=%d\n", (int)indata.dsize));
202 return -1;
205 ids = (uint64_t *)indata.dptr;
206 num_ids = indata.dsize / 8;
208 results = talloc_zero_array(outdata, uint8_t, (num_ids+7)/8);
209 if (results == NULL) {
210 DEBUG(DEBUG_ERR, ("talloc failed in daemon_check_srvids\n"));
211 return -1;
213 for (i=0; i<num_ids; i++) {
214 if (ctdb_check_message_handler(ctdb, ids[i])) {
215 results[i/8] |= (1 << (i%8));
218 outdata->dptr = (uint8_t *)results;
219 outdata->dsize = talloc_get_size(results);
220 return 0;
224 destroy a ctdb_client
226 static int ctdb_client_destructor(struct ctdb_client *client)
228 struct ctdb_db_context *ctdb_db;
230 ctdb_takeover_client_destructor_hook(client);
231 ctdb_reqid_remove(client->ctdb, client->client_id);
232 client->ctdb->num_clients--;
234 if (client->num_persistent_updates != 0) {
235 DEBUG(DEBUG_ERR,(__location__ " Client disconnecting with %u persistent updates in flight. Starting recovery\n", client->num_persistent_updates));
236 client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
238 ctdb_db = find_ctdb_db(client->ctdb, client->db_id);
239 if (ctdb_db) {
240 DEBUG(DEBUG_ERR, (__location__ " client exit while transaction "
241 "commit active. Forcing recovery.\n"));
242 client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
245 * trans3 transaction state:
247 * The destructor sets the pointer to NULL.
249 talloc_free(ctdb_db->persistent_state);
252 return 0;
257 this is called when the ctdb daemon received a ctdb request message
258 from a local client over the unix domain socket
260 static void daemon_request_message_from_client(struct ctdb_client *client,
261 struct ctdb_req_message *c)
263 TDB_DATA data;
264 int res;
266 if (c->hdr.destnode == CTDB_CURRENT_NODE) {
267 c->hdr.destnode = ctdb_get_pnn(client->ctdb);
270 /* maybe the message is for another client on this node */
271 if (ctdb_get_pnn(client->ctdb)==c->hdr.destnode) {
272 ctdb_request_message(client->ctdb, (struct ctdb_req_header *)c);
273 return;
276 /* its for a remote node */
277 data.dptr = &c->data[0];
278 data.dsize = c->datalen;
279 res = ctdb_daemon_send_message(client->ctdb, c->hdr.destnode,
280 c->srvid, data);
281 if (res != 0) {
282 DEBUG(DEBUG_ERR,(__location__ " Failed to send message to remote node %u\n",
283 c->hdr.destnode));
/*
  per-call bookkeeping for a call issued by a local client; handed to
  daemon_call_from_client_callback() as the async private data
 */
struct daemon_call_state {
	struct ctdb_client *client;
	uint32_t reqid;			/* reqid of the client request */
	struct ctdb_call *call;
	struct timeval start_time;	/* used for the latency statistics */

	/* readonly request ? */
	uint32_t readonly_fetch;
	/* call id the client asked for; only set for readonly requests */
	uint32_t client_callid;
};

300 complete a call from a client
302 static void daemon_call_from_client_callback(struct ctdb_call_state *state)
304 struct daemon_call_state *dstate = talloc_get_type(state->async.private_data,
305 struct daemon_call_state);
306 struct ctdb_reply_call *r;
307 int res;
308 uint32_t length;
309 struct ctdb_client *client = dstate->client;
310 struct ctdb_db_context *ctdb_db = state->ctdb_db;
312 talloc_steal(client, dstate);
313 talloc_steal(dstate, dstate->call);
315 res = ctdb_daemon_call_recv(state, dstate->call);
316 if (res != 0) {
317 DEBUG(DEBUG_ERR, (__location__ " ctdbd_call_recv() returned error\n"));
318 CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
320 CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 1", call_latency, dstate->start_time);
321 return;
324 length = offsetof(struct ctdb_reply_call, data) + dstate->call->reply_data.dsize;
325 /* If the client asked for readonly FETCH, we remapped this to
326 FETCH_WITH_HEADER when calling the daemon. So we must
327 strip the extra header off the reply data before passing
328 it back to the client.
330 if (dstate->readonly_fetch
331 && dstate->client_callid == CTDB_FETCH_FUNC) {
332 length -= sizeof(struct ctdb_ltdb_header);
335 r = ctdbd_allocate_pkt(client->ctdb, dstate, CTDB_REPLY_CALL,
336 length, struct ctdb_reply_call);
337 if (r == NULL) {
338 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate reply_call in ctdb daemon\n"));
339 CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
340 CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 2", call_latency, dstate->start_time);
341 return;
343 r->hdr.reqid = dstate->reqid;
344 r->status = dstate->call->status;
346 if (dstate->readonly_fetch
347 && dstate->client_callid == CTDB_FETCH_FUNC) {
348 /* client only asked for a FETCH so we must strip off
349 the extra ctdb_ltdb header
351 r->datalen = dstate->call->reply_data.dsize - sizeof(struct ctdb_ltdb_header);
352 memcpy(&r->data[0], dstate->call->reply_data.dptr + sizeof(struct ctdb_ltdb_header), r->datalen);
353 } else {
354 r->datalen = dstate->call->reply_data.dsize;
355 memcpy(&r->data[0], dstate->call->reply_data.dptr, r->datalen);
358 res = daemon_queue_send(client, &r->hdr);
359 if (res == -1) {
360 /* client is dead - return immediately */
361 return;
363 if (res != 0) {
364 DEBUG(DEBUG_ERR, (__location__ " Failed to queue packet from daemon to client\n"));
366 CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 3", call_latency, dstate->start_time);
367 CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
368 talloc_free(dstate);
/*
  identifies a client by id rather than by pointer, so a packet that is
  processed later can detect that the client has gone away meanwhile
 */
struct ctdb_daemon_packet_wrap {
	struct ctdb_context *ctdb;
	uint32_t client_id;
};

377 a wrapper to catch disconnected clients
379 static void daemon_incoming_packet_wrap(void *p, struct ctdb_req_header *hdr)
381 struct ctdb_client *client;
382 struct ctdb_daemon_packet_wrap *w = talloc_get_type(p,
383 struct ctdb_daemon_packet_wrap);
384 if (w == NULL) {
385 DEBUG(DEBUG_CRIT,(__location__ " Bad packet type '%s'\n", talloc_get_name(p)));
386 return;
389 client = ctdb_reqid_find(w->ctdb, w->client_id, struct ctdb_client);
390 if (client == NULL) {
391 DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
392 w->client_id));
393 talloc_free(w);
394 return;
396 talloc_free(w);
398 /* process it */
399 daemon_incoming_packet(client, hdr);
/* one client call that was deferred behind an in-flight fetch */
struct ctdb_deferred_fetch_call {
	struct ctdb_deferred_fetch_call *next, *prev;	/* list linkage */
	struct ctdb_req_call *c;			/* the deferred request */
	struct ctdb_daemon_packet_wrap *w;		/* who sent it */
};

/* list of the calls deferred for one key */
struct ctdb_deferred_fetch_queue {
	struct ctdb_deferred_fetch_call *deferred_calls;
};

/* private data of the timer that re-injects one deferred call */
struct ctdb_deferred_requeue {
	struct ctdb_deferred_fetch_call *dfc;
	struct ctdb_client *client;
};

417 /* called from a timer event and starts reprocessing the deferred call.*/
418 static void reprocess_deferred_call(struct event_context *ev, struct timed_event *te,
419 struct timeval t, void *private_data)
421 struct ctdb_deferred_requeue *dfr = (struct ctdb_deferred_requeue *)private_data;
422 struct ctdb_client *client = dfr->client;
424 talloc_steal(client, dfr->dfc->c);
425 daemon_incoming_packet(client, (struct ctdb_req_header *)dfr->dfc->c);
426 talloc_free(dfr);
429 /* the referral context is destroyed either after a timeout or when the initial
430 fetch-lock has finished.
431 at this stage, immediately start reprocessing the queued up deferred
432 calls so they get reprocessed immediately (and since we are dmaster at
433 this stage, trigger the waiting smbd processes to pick up and aquire the
434 record right away.
436 static int deferred_fetch_queue_destructor(struct ctdb_deferred_fetch_queue *dfq)
439 /* need to reprocess the packets from the queue explicitely instead of
440 just using a normal destructor since we want, need, to
441 call the clients in the same oder as the requests queued up
443 while (dfq->deferred_calls != NULL) {
444 struct ctdb_client *client;
445 struct ctdb_deferred_fetch_call *dfc = dfq->deferred_calls;
446 struct ctdb_deferred_requeue *dfr;
448 DLIST_REMOVE(dfq->deferred_calls, dfc);
450 client = ctdb_reqid_find(dfc->w->ctdb, dfc->w->client_id, struct ctdb_client);
451 if (client == NULL) {
452 DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
453 dfc->w->client_id));
454 continue;
457 /* process it by pushing it back onto the eventloop */
458 dfr = talloc(client, struct ctdb_deferred_requeue);
459 if (dfr == NULL) {
460 DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch requeue structure\n"));
461 continue;
464 dfr->dfc = talloc_steal(dfr, dfc);
465 dfr->client = client;
467 event_add_timed(dfc->w->ctdb->ev, client, timeval_zero(), reprocess_deferred_call, dfr);
470 return 0;
473 /* insert the new deferral context into the rb tree.
474 there should never be a pre-existing context here, but check for it
475 warn and destroy the previous context if there is already a deferral context
476 for this key.
478 static void *insert_dfq_callback(void *parm, void *data)
480 if (data) {
481 DEBUG(DEBUG_ERR,("Already have DFQ registered. Free old %p and create new %p\n", data, parm));
482 talloc_free(data);
484 return parm;
/* if the original fetch-lock did not complete within a reasonable time,
   free the context and context for all deferred requests to cause them to be
   re-inserted into the event system.

   private_data is the deferred fetch queue registered with the timer.
*/
static void dfq_timeout(struct event_context *ev, struct timed_event *te,
			struct timeval t, void *private_data)
{
	void *expired_queue = private_data;

	talloc_free(expired_queue);
}

497 /* This function is used in the local daemon to register a KEY in a database
498 for being "fetched"
499 While the remote fetch is in-flight, any futher attempts to re-fetch the
500 same record will be deferred until the fetch completes.
502 static int setup_deferred_fetch_locks(struct ctdb_db_context *ctdb_db, struct ctdb_call *call)
504 uint32_t *k;
505 struct ctdb_deferred_fetch_queue *dfq;
507 k = talloc_zero_size(call, ((call->key.dsize + 3) & 0xfffffffc) + 4);
508 if (k == NULL) {
509 DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
510 return -1;
513 k[0] = (call->key.dsize + 3) / 4 + 1;
514 memcpy(&k[1], call->key.dptr, call->key.dsize);
516 dfq = talloc(call, struct ctdb_deferred_fetch_queue);
517 if (dfq == NULL) {
518 DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch queue structure\n"));
519 talloc_free(k);
520 return -1;
522 dfq->deferred_calls = NULL;
524 trbt_insertarray32_callback(ctdb_db->deferred_fetch, k[0], &k[0], insert_dfq_callback, dfq);
526 talloc_set_destructor(dfq, deferred_fetch_queue_destructor);
528 /* if the fetch havent completed in 30 seconds, just tear it all down
529 and let it try again as the events are reissued */
530 event_add_timed(ctdb_db->ctdb->ev, dfq, timeval_current_ofs(30, 0), dfq_timeout, dfq);
532 talloc_free(k);
533 return 0;
536 /* check if this is a duplicate request to a fetch already in-flight
537 if it is, make this call deferred to be reprocessed later when
538 the in-flight fetch completes.
540 static int requeue_duplicate_fetch(struct ctdb_db_context *ctdb_db, struct ctdb_client *client, TDB_DATA key, struct ctdb_req_call *c)
542 uint32_t *k;
543 struct ctdb_deferred_fetch_queue *dfq;
544 struct ctdb_deferred_fetch_call *dfc;
546 k = talloc_zero_size(c, ((key.dsize + 3) & 0xfffffffc) + 4);
547 if (k == NULL) {
548 DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
549 return -1;
552 k[0] = (key.dsize + 3) / 4 + 1;
553 memcpy(&k[1], key.dptr, key.dsize);
555 dfq = trbt_lookuparray32(ctdb_db->deferred_fetch, k[0], &k[0]);
556 if (dfq == NULL) {
557 talloc_free(k);
558 return -1;
562 talloc_free(k);
564 dfc = talloc(dfq, struct ctdb_deferred_fetch_call);
565 if (dfc == NULL) {
566 DEBUG(DEBUG_ERR, ("Failed to allocate deferred fetch call structure\n"));
567 return -1;
570 dfc->w = talloc(dfc, struct ctdb_daemon_packet_wrap);
571 if (dfc->w == NULL) {
572 DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch daemon packet wrap structure\n"));
573 talloc_free(dfc);
574 return -1;
577 dfc->c = talloc_steal(dfc, c);
578 dfc->w->ctdb = ctdb_db->ctdb;
579 dfc->w->client_id = client->client_id;
581 DLIST_ADD_END(dfq->deferred_calls, dfc, NULL);
583 return 0;
588 this is called when the ctdb daemon received a ctdb request call
589 from a local client over the unix domain socket
591 static void daemon_request_call_from_client(struct ctdb_client *client,
592 struct ctdb_req_call *c)
594 struct ctdb_call_state *state;
595 struct ctdb_db_context *ctdb_db;
596 struct daemon_call_state *dstate;
597 struct ctdb_call *call;
598 struct ctdb_ltdb_header header;
599 TDB_DATA key, data;
600 int ret;
601 struct ctdb_context *ctdb = client->ctdb;
602 struct ctdb_daemon_packet_wrap *w;
604 CTDB_INCREMENT_STAT(ctdb, total_calls);
605 CTDB_DECREMENT_STAT(ctdb, pending_calls);
607 ctdb_db = find_ctdb_db(client->ctdb, c->db_id);
608 if (!ctdb_db) {
609 DEBUG(DEBUG_ERR, (__location__ " Unknown database in request. db_id==0x%08x",
610 c->db_id));
611 CTDB_DECREMENT_STAT(ctdb, pending_calls);
612 return;
615 if (ctdb_db->unhealthy_reason) {
617 * this is just a warning, as the tdb should be empty anyway,
618 * and only persistent databases can be unhealthy, which doesn't
619 * use this code patch
621 DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in daemon_request_call_from_client(): %s\n",
622 ctdb_db->db_name, ctdb_db->unhealthy_reason));
625 key.dptr = c->data;
626 key.dsize = c->keylen;
628 w = talloc(ctdb, struct ctdb_daemon_packet_wrap);
629 CTDB_NO_MEMORY_VOID(ctdb, w);
631 w->ctdb = ctdb;
632 w->client_id = client->client_id;
634 ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header,
635 (struct ctdb_req_header *)c, &data,
636 daemon_incoming_packet_wrap, w, true);
637 if (ret == -2) {
638 /* will retry later */
639 CTDB_DECREMENT_STAT(ctdb, pending_calls);
640 return;
643 talloc_free(w);
645 if (ret != 0) {
646 DEBUG(DEBUG_ERR,(__location__ " Unable to fetch record\n"));
647 CTDB_DECREMENT_STAT(ctdb, pending_calls);
648 return;
652 /* check if this fetch request is a duplicate for a
653 request we already have in flight. If so defer it until
654 the first request completes.
656 if (ctdb->tunable.fetch_collapse == 1) {
657 if (requeue_duplicate_fetch(ctdb_db, client, key, c) == 0) {
658 ret = ctdb_ltdb_unlock(ctdb_db, key);
659 if (ret != 0) {
660 DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
662 return;
666 /* Dont do READONLY if we dont have a tracking database */
667 if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db->readonly) {
668 c->flags &= ~CTDB_WANT_READONLY;
671 if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
672 header.flags &= ~CTDB_REC_RO_FLAGS;
673 CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
674 CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes);
675 if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
676 ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
678 /* and clear out the tracking data */
679 if (tdb_delete(ctdb_db->rottdb, key) != 0) {
680 DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
684 /* if we are revoking, we must defer all other calls until the revoke
685 * had completed.
687 if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
688 talloc_free(data.dptr);
689 ret = ctdb_ltdb_unlock(ctdb_db, key);
691 if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
692 ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
694 return;
697 if ((header.dmaster == ctdb->pnn)
698 && (!(c->flags & CTDB_WANT_READONLY))
699 && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
700 header.flags |= CTDB_REC_RO_REVOKING_READONLY;
701 if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
702 ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
704 ret = ctdb_ltdb_unlock(ctdb_db, key);
706 if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, key, &header, data) != 0) {
707 ctdb_fatal(ctdb, "Failed to start record revoke");
709 talloc_free(data.dptr);
711 if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
712 ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
715 return;
718 dstate = talloc(client, struct daemon_call_state);
719 if (dstate == NULL) {
720 ret = ctdb_ltdb_unlock(ctdb_db, key);
721 if (ret != 0) {
722 DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
725 DEBUG(DEBUG_ERR,(__location__ " Unable to allocate dstate\n"));
726 CTDB_DECREMENT_STAT(ctdb, pending_calls);
727 return;
729 dstate->start_time = timeval_current();
730 dstate->client = client;
731 dstate->reqid = c->hdr.reqid;
732 talloc_steal(dstate, data.dptr);
734 call = dstate->call = talloc_zero(dstate, struct ctdb_call);
735 if (call == NULL) {
736 ret = ctdb_ltdb_unlock(ctdb_db, key);
737 if (ret != 0) {
738 DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
741 DEBUG(DEBUG_ERR,(__location__ " Unable to allocate call\n"));
742 CTDB_DECREMENT_STAT(ctdb, pending_calls);
743 CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 1", call_latency, dstate->start_time);
744 return;
747 dstate->readonly_fetch = 0;
748 call->call_id = c->callid;
749 call->key = key;
750 call->call_data.dptr = c->data + c->keylen;
751 call->call_data.dsize = c->calldatalen;
752 call->flags = c->flags;
754 if (c->flags & CTDB_WANT_READONLY) {
755 /* client wants readonly record, so translate this into a
756 fetch with header. remember what the client asked for
757 so we can remap the reply back to the proper format for
758 the client in the reply
760 dstate->client_callid = call->call_id;
761 call->call_id = CTDB_FETCH_WITH_HEADER_FUNC;
762 dstate->readonly_fetch = 1;
765 if (header.dmaster == ctdb->pnn) {
766 state = ctdb_call_local_send(ctdb_db, call, &header, &data);
767 } else {
768 state = ctdb_daemon_call_send_remote(ctdb_db, call, &header);
769 if (ctdb->tunable.fetch_collapse == 1) {
770 /* This request triggered a remote fetch-lock.
771 set up a deferral for this key so any additional
772 fetch-locks are deferred until the current one
773 finishes.
775 setup_deferred_fetch_locks(ctdb_db, call);
779 ret = ctdb_ltdb_unlock(ctdb_db, key);
780 if (ret != 0) {
781 DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
784 if (state == NULL) {
785 DEBUG(DEBUG_ERR,(__location__ " Unable to setup call send\n"));
786 CTDB_DECREMENT_STAT(ctdb, pending_calls);
787 CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 2", call_latency, dstate->start_time);
788 return;
790 talloc_steal(state, dstate);
791 talloc_steal(client, state);
793 state->async.fn = daemon_call_from_client_callback;
794 state->async.private_data = dstate;
798 static void daemon_request_control_from_client(struct ctdb_client *client,
799 struct ctdb_req_control *c);
801 /* data contains a packet from the client */
802 static void daemon_incoming_packet(void *p, struct ctdb_req_header *hdr)
804 struct ctdb_client *client = talloc_get_type(p, struct ctdb_client);
805 TALLOC_CTX *tmp_ctx;
806 struct ctdb_context *ctdb = client->ctdb;
808 /* place the packet as a child of a tmp_ctx. We then use
809 talloc_free() below to free it. If any of the calls want
810 to keep it, then they will steal it somewhere else, and the
811 talloc_free() will be a no-op */
812 tmp_ctx = talloc_new(client);
813 talloc_steal(tmp_ctx, hdr);
815 if (hdr->ctdb_magic != CTDB_MAGIC) {
816 ctdb_set_error(client->ctdb, "Non CTDB packet rejected in daemon\n");
817 goto done;
820 if (hdr->ctdb_version != CTDB_VERSION) {
821 ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
822 goto done;
825 switch (hdr->operation) {
826 case CTDB_REQ_CALL:
827 CTDB_INCREMENT_STAT(ctdb, client.req_call);
828 daemon_request_call_from_client(client, (struct ctdb_req_call *)hdr);
829 break;
831 case CTDB_REQ_MESSAGE:
832 CTDB_INCREMENT_STAT(ctdb, client.req_message);
833 daemon_request_message_from_client(client, (struct ctdb_req_message *)hdr);
834 break;
836 case CTDB_REQ_CONTROL:
837 CTDB_INCREMENT_STAT(ctdb, client.req_control);
838 daemon_request_control_from_client(client, (struct ctdb_req_control *)hdr);
839 break;
841 default:
842 DEBUG(DEBUG_CRIT,(__location__ " daemon: unrecognized operation %u\n",
843 hdr->operation));
846 done:
847 talloc_free(tmp_ctx);
851 called when the daemon gets a incoming packet
853 static void ctdb_daemon_read_cb(uint8_t *data, size_t cnt, void *args)
855 struct ctdb_client *client = talloc_get_type(args, struct ctdb_client);
856 struct ctdb_req_header *hdr;
858 if (cnt == 0) {
859 talloc_free(client);
860 return;
863 CTDB_INCREMENT_STAT(client->ctdb, client_packets_recv);
865 if (cnt < sizeof(*hdr)) {
866 ctdb_set_error(client->ctdb, "Bad packet length %u in daemon\n",
867 (unsigned)cnt);
868 return;
870 hdr = (struct ctdb_req_header *)data;
871 if (cnt != hdr->length) {
872 ctdb_set_error(client->ctdb, "Bad header length %u expected %u\n in daemon",
873 (unsigned)hdr->length, (unsigned)cnt);
874 return;
877 if (hdr->ctdb_magic != CTDB_MAGIC) {
878 ctdb_set_error(client->ctdb, "Non CTDB packet rejected\n");
879 return;
882 if (hdr->ctdb_version != CTDB_VERSION) {
883 ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
884 return;
887 DEBUG(DEBUG_DEBUG,(__location__ " client request %u of type %u length %u from "
888 "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length,
889 hdr->srcnode, hdr->destnode));
891 /* it is the responsibility of the incoming packet function to free 'data' */
892 daemon_incoming_packet(client, hdr);
896 static int ctdb_clientpid_destructor(struct ctdb_client_pid_list *client_pid)
898 if (client_pid->ctdb->client_pids != NULL) {
899 DLIST_REMOVE(client_pid->ctdb->client_pids, client_pid);
902 return 0;
906 static void ctdb_accept_client(struct event_context *ev, struct fd_event *fde,
907 uint16_t flags, void *private_data)
909 struct sockaddr_un addr;
910 socklen_t len;
911 int fd;
912 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
913 struct ctdb_client *client;
914 struct ctdb_client_pid_list *client_pid;
915 pid_t peer_pid = 0;
917 memset(&addr, 0, sizeof(addr));
918 len = sizeof(addr);
919 fd = accept(ctdb->daemon.sd, (struct sockaddr *)&addr, &len);
920 if (fd == -1) {
921 return;
924 set_nonblocking(fd);
925 set_close_on_exec(fd);
927 DEBUG(DEBUG_DEBUG,(__location__ " Created SOCKET FD:%d to connected child\n", fd));
929 client = talloc_zero(ctdb, struct ctdb_client);
930 if (ctdb_get_peer_pid(fd, &peer_pid) == 0) {
931 DEBUG(DEBUG_INFO,("Connected client with pid:%u\n", (unsigned)peer_pid));
934 client->ctdb = ctdb;
935 client->fd = fd;
936 client->client_id = ctdb_reqid_new(ctdb, client);
937 client->pid = peer_pid;
939 client_pid = talloc(client, struct ctdb_client_pid_list);
940 if (client_pid == NULL) {
941 DEBUG(DEBUG_ERR,("Failed to allocate client pid structure\n"));
942 close(fd);
943 talloc_free(client);
944 return;
946 client_pid->ctdb = ctdb;
947 client_pid->pid = peer_pid;
948 client_pid->client = client;
950 DLIST_ADD(ctdb->client_pids, client_pid);
952 client->queue = ctdb_queue_setup(ctdb, client, fd, CTDB_DS_ALIGNMENT,
953 ctdb_daemon_read_cb, client,
954 "client-%u", client->pid);
956 talloc_set_destructor(client, ctdb_client_destructor);
957 talloc_set_destructor(client_pid, ctdb_clientpid_destructor);
958 ctdb->num_clients++;
964 create a unix domain socket and bind it
965 return a file descriptor open on the socket
967 static int ux_socket_bind(struct ctdb_context *ctdb)
969 struct sockaddr_un addr;
971 ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
972 if (ctdb->daemon.sd == -1) {
973 return -1;
976 memset(&addr, 0, sizeof(addr));
977 addr.sun_family = AF_UNIX;
978 strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)-1);
980 /* First check if an old ctdbd might be running */
981 if (connect(ctdb->daemon.sd,
982 (struct sockaddr *)&addr, sizeof(addr)) == 0) {
983 DEBUG(DEBUG_CRIT,
984 ("Something is already listening on ctdb socket '%s'\n",
985 ctdb->daemon.name));
986 goto failed;
989 /* Remove any old socket */
990 unlink(ctdb->daemon.name);
992 set_close_on_exec(ctdb->daemon.sd);
993 set_nonblocking(ctdb->daemon.sd);
995 if (bind(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
996 DEBUG(DEBUG_CRIT,("Unable to bind on ctdb socket '%s'\n", ctdb->daemon.name));
997 goto failed;
1000 if (chown(ctdb->daemon.name, geteuid(), getegid()) != 0 ||
1001 chmod(ctdb->daemon.name, 0700) != 0) {
1002 DEBUG(DEBUG_CRIT,("Unable to secure ctdb socket '%s', ctdb->daemon.name\n", ctdb->daemon.name));
1003 goto failed;
1007 if (listen(ctdb->daemon.sd, 100) != 0) {
1008 DEBUG(DEBUG_CRIT,("Unable to listen on ctdb socket '%s'\n", ctdb->daemon.name));
1009 goto failed;
1012 return 0;
1014 failed:
1015 close(ctdb->daemon.sd);
1016 ctdb->daemon.sd = -1;
1017 return -1;
1020 static void initialise_node_flags (struct ctdb_context *ctdb)
1022 if (ctdb->pnn == -1) {
1023 ctdb_fatal(ctdb, "PNN is set to -1 (unknown value)");
1026 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_DISCONNECTED;
1028 /* do we start out in DISABLED mode? */
1029 if (ctdb->start_as_disabled != 0) {
1030 DEBUG(DEBUG_INFO, ("This node is configured to start in DISABLED state\n"));
1031 ctdb->nodes[ctdb->pnn]->flags |= NODE_FLAGS_DISABLED;
1033 /* do we start out in STOPPED mode? */
1034 if (ctdb->start_as_stopped != 0) {
1035 DEBUG(DEBUG_INFO, ("This node is configured to start in STOPPED state\n"));
1036 ctdb->nodes[ctdb->pnn]->flags |= NODE_FLAGS_STOPPED;
1040 static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
1041 void *private_data)
1043 if (status != 0) {
1044 ctdb_die(ctdb, "Failed to run setup event");
1046 ctdb_run_notification_script(ctdb, "setup");
1048 /* tell all other nodes we've just started up */
1049 ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL,
1050 0, CTDB_CONTROL_STARTUP, 0,
1051 CTDB_CTRL_FLAG_NOREPLY,
1052 tdb_null, NULL, NULL);
1054 /* Start the recovery daemon */
1055 if (ctdb_start_recoverd(ctdb) != 0) {
1056 DEBUG(DEBUG_ALERT,("Failed to start recovery daemon\n"));
1057 exit(11);
1060 ctdb_start_periodic_events(ctdb);
1062 ctdb_wait_for_first_recovery(ctdb);
1065 static struct timeval tevent_before_wait_ts;
1066 static struct timeval tevent_after_wait_ts;
1068 static void ctdb_tevent_trace(enum tevent_trace_point tp,
1069 void *private_data)
1071 struct timeval diff;
1072 struct timeval now;
1073 struct ctdb_context *ctdb =
1074 talloc_get_type(private_data, struct ctdb_context);
1076 if (getpid() != ctdb->ctdbd_pid) {
1077 return;
1080 now = timeval_current();
1082 switch (tp) {
1083 case TEVENT_TRACE_BEFORE_WAIT:
1084 if (!timeval_is_zero(&tevent_after_wait_ts)) {
1085 diff = timeval_until(&tevent_after_wait_ts, &now);
1086 if (diff.tv_sec > 3) {
1087 DEBUG(DEBUG_ERR,
1088 ("Handling event took %ld seconds!\n",
1089 diff.tv_sec));
1092 tevent_before_wait_ts = now;
1093 break;
1095 case TEVENT_TRACE_AFTER_WAIT:
1096 if (!timeval_is_zero(&tevent_before_wait_ts)) {
1097 diff = timeval_until(&tevent_before_wait_ts, &now);
1098 if (diff.tv_sec > 3) {
1099 DEBUG(DEBUG_CRIT,
1100 ("No event for %ld seconds!\n",
1101 diff.tv_sec));
1104 tevent_after_wait_ts = now;
1105 break;
1107 default:
1108 /* Do nothing for future tevent trace points */ ;
1112 static void ctdb_remove_pidfile(void)
1114 /* Only the main ctdbd's PID matches the SID */
1115 if (ctdbd_pidfile != NULL && getsid(0) == getpid()) {
1116 if (unlink(ctdbd_pidfile) == 0) {
1117 DEBUG(DEBUG_NOTICE, ("Removed PID file %s\n",
1118 ctdbd_pidfile));
1119 } else {
1120 DEBUG(DEBUG_WARNING, ("Failed to Remove PID file %s\n",
1121 ctdbd_pidfile));
1126 static void ctdb_create_pidfile(pid_t pid)
1128 if (ctdbd_pidfile != NULL) {
1129 FILE *fp;
1131 fp = fopen(ctdbd_pidfile, "w");
1132 if (fp == NULL) {
1133 DEBUG(DEBUG_ALERT,
1134 ("Failed to open PID file %s\n", ctdbd_pidfile));
1135 exit(11);
1138 fprintf(fp, "%d\n", pid);
1139 fclose(fp);
1140 DEBUG(DEBUG_NOTICE, ("Created PID file %s\n", ctdbd_pidfile));
1141 atexit(ctdb_remove_pidfile);
1146 start the protocol going as a daemon
1148 int ctdb_start_daemon(struct ctdb_context *ctdb, bool do_fork, bool use_syslog)
1150 int res, ret = -1;
1151 struct fd_event *fde;
1152 const char *domain_socket_name;
1154 /* create a unix domain stream socket to listen to */
1155 res = ux_socket_bind(ctdb);
1156 if (res!=0) {
1157 DEBUG(DEBUG_ALERT,("Cannot continue. Exiting!\n"));
1158 exit(10);
1161 if (do_fork && fork()) {
1162 return 0;
1165 tdb_reopen_all(false);
1167 if (do_fork) {
1168 if (setsid() == -1) {
1169 ctdb_die(ctdb, "Failed to setsid()\n");
1171 close(0);
1172 if (open("/dev/null", O_RDONLY) != 0) {
1173 DEBUG(DEBUG_ALERT,(__location__ " Failed to setup stdin on /dev/null\n"));
1174 exit(11);
1177 ignore_signal(SIGPIPE);
1179 ctdb->ctdbd_pid = getpid();
1180 DEBUG(DEBUG_ERR, ("Starting CTDBD (Version %s) as PID: %u\n",
1181 CTDB_VERSION_STRING, ctdb->ctdbd_pid));
1182 ctdb_create_pidfile(ctdb->ctdbd_pid);
1184 /* Make sure we log something when the daemon terminates.
1185 * This must be the first exit handler to run (so the last to
1186 * be registered.
1188 atexit(print_exit_message);
1190 if (ctdb->do_setsched) {
1191 /* try to set us up as realtime */
1192 set_scheduler();
1195 /* ensure the socket is deleted on exit of the daemon */
1196 domain_socket_name = talloc_strdup(talloc_autofree_context(), ctdb->daemon.name);
1197 if (domain_socket_name == NULL) {
1198 DEBUG(DEBUG_ALERT,(__location__ " talloc_strdup failed.\n"));
1199 exit(12);
1202 ctdb->ev = event_context_init(NULL);
1203 tevent_loop_allow_nesting(ctdb->ev);
1204 tevent_set_trace_callback(ctdb->ev, ctdb_tevent_trace, ctdb);
1205 ret = ctdb_init_tevent_logging(ctdb);
1206 if (ret != 0) {
1207 DEBUG(DEBUG_ALERT,("Failed to initialize TEVENT logging\n"));
1208 exit(1);
1211 /* set up a handler to pick up sigchld */
1212 if (ctdb_init_sigchld(ctdb) == NULL) {
1213 DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD\n"));
1214 exit(1);
1217 ctdb_set_child_logging(ctdb);
1218 if (use_syslog) {
1219 if (start_syslog_daemon(ctdb)) {
1220 DEBUG(DEBUG_CRIT, ("Failed to start syslog daemon\n"));
1221 exit(10);
1225 /* initialize statistics collection */
1226 ctdb_statistics_init(ctdb);
1228 /* force initial recovery for election */
1229 ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
1231 ctdb_set_runstate(ctdb, CTDB_RUNSTATE_INIT);
1232 ret = ctdb_event_script(ctdb, CTDB_EVENT_INIT);
1233 if (ret != 0) {
1234 ctdb_die(ctdb, "Failed to run init event\n");
1236 ctdb_run_notification_script(ctdb, "init");
1238 if (strcmp(ctdb->transport, "tcp") == 0) {
1239 int ctdb_tcp_init(struct ctdb_context *);
1240 ret = ctdb_tcp_init(ctdb);
1242 #ifdef USE_INFINIBAND
1243 if (strcmp(ctdb->transport, "ib") == 0) {
1244 int ctdb_ibw_init(struct ctdb_context *);
1245 ret = ctdb_ibw_init(ctdb);
1247 #endif
1248 if (ret != 0) {
1249 DEBUG(DEBUG_ERR,("Failed to initialise transport '%s'\n", ctdb->transport));
1250 return -1;
1253 if (ctdb->methods == NULL) {
1254 DEBUG(DEBUG_ALERT,(__location__ " Can not initialize transport. ctdb->methods is NULL\n"));
1255 ctdb_fatal(ctdb, "transport is unavailable. can not initialize.");
1258 /* initialise the transport */
1259 if (ctdb->methods->initialise(ctdb) != 0) {
1260 ctdb_fatal(ctdb, "transport failed to initialise");
1263 initialise_node_flags(ctdb);
1265 if (ctdb->public_addresses_file) {
1266 ret = ctdb_set_public_addresses(ctdb, true);
1267 if (ret == -1) {
1268 DEBUG(DEBUG_ALERT,("Unable to setup public address list\n"));
1269 exit(1);
1271 if (ctdb->do_checkpublicip) {
1272 ctdb_start_monitoring_interfaces(ctdb);
1277 /* attach to existing databases */
1278 if (ctdb_attach_databases(ctdb) != 0) {
1279 ctdb_fatal(ctdb, "Failed to attach to databases\n");
1282 /* start frozen, then let the first election sort things out */
1283 if (!ctdb_blocking_freeze(ctdb)) {
1284 ctdb_fatal(ctdb, "Failed to get initial freeze\n");
1287 /* now start accepting clients, only can do this once frozen */
1288 fde = event_add_fd(ctdb->ev, ctdb, ctdb->daemon.sd,
1289 EVENT_FD_READ,
1290 ctdb_accept_client, ctdb);
1291 if (fde == NULL) {
1292 ctdb_fatal(ctdb, "Failed to add daemon socket to event loop");
1294 tevent_fd_set_auto_close(fde);
1296 /* release any IPs we hold from previous runs of the daemon */
1297 if (ctdb->tunable.disable_ip_failover == 0) {
1298 ctdb_release_all_ips(ctdb);
1301 /* Start the transport */
1302 if (ctdb->methods->start(ctdb) != 0) {
1303 DEBUG(DEBUG_ALERT,("transport failed to start!\n"));
1304 ctdb_fatal(ctdb, "transport failed to start");
1307 /* Recovery daemon and timed events are started from the
1308 * callback, only after the setup event completes
1309 * successfully.
1311 ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SETUP);
1312 ret = ctdb_event_script_callback(ctdb,
1313 ctdb,
1314 ctdb_setup_event_callback,
1315 ctdb,
1316 CTDB_EVENT_SETUP,
1317 "%s",
1318 "");
1319 if (ret != 0) {
1320 DEBUG(DEBUG_CRIT,("Failed to set up 'setup' event\n"));
1321 exit(1);
1324 lockdown_memory(ctdb->valgrinding);
1326 /* go into a wait loop to allow other nodes to complete */
1327 event_loop_wait(ctdb->ev);
1329 DEBUG(DEBUG_CRIT,("event_loop_wait() returned. this should not happen\n"));
1330 exit(1);
1334 allocate a packet for use in daemon<->daemon communication
1336 struct ctdb_req_header *_ctdb_transport_allocate(struct ctdb_context *ctdb,
1337 TALLOC_CTX *mem_ctx,
1338 enum ctdb_operation operation,
1339 size_t length, size_t slength,
1340 const char *type)
1342 int size;
1343 struct ctdb_req_header *hdr;
1345 length = MAX(length, slength);
1346 size = (length+(CTDB_DS_ALIGNMENT-1)) & ~(CTDB_DS_ALIGNMENT-1);
1348 if (ctdb->methods == NULL) {
1349 DEBUG(DEBUG_INFO,(__location__ " Unable to allocate transport packet for operation %u of length %u. Transport is DOWN.\n",
1350 operation, (unsigned)length));
1351 return NULL;
1354 hdr = (struct ctdb_req_header *)ctdb->methods->allocate_pkt(mem_ctx, size);
1355 if (hdr == NULL) {
1356 DEBUG(DEBUG_ERR,("Unable to allocate transport packet for operation %u of length %u\n",
1357 operation, (unsigned)length));
1358 return NULL;
1360 talloc_set_name_const(hdr, type);
1361 memset(hdr, 0, slength);
1362 hdr->length = length;
1363 hdr->operation = operation;
1364 hdr->ctdb_magic = CTDB_MAGIC;
1365 hdr->ctdb_version = CTDB_VERSION;
1366 hdr->generation = ctdb->vnn_map->generation;
1367 hdr->srcnode = ctdb->pnn;
1369 return hdr;
/*
  State kept for a control that a local client asked the daemon to
  send and for which a reply may still arrive.
*/
struct daemon_control_state {
	/* links in the destination node's pending_controls list */
	struct daemon_control_state *next;
	struct daemon_control_state *prev;
	/* the local client that issued the control */
	struct ctdb_client *client;
	/* the original request */
	struct ctdb_req_control *c;
	/* request id of the original request, echoed in the reply */
	uint32_t reqid;
	/* destination node, or NULL if the destination is not a valid PNN */
	struct ctdb_node *node;
};
1381 callback when a control reply comes in
1383 static void daemon_control_callback(struct ctdb_context *ctdb,
1384 int32_t status, TDB_DATA data,
1385 const char *errormsg,
1386 void *private_data)
1388 struct daemon_control_state *state = talloc_get_type(private_data,
1389 struct daemon_control_state);
1390 struct ctdb_client *client = state->client;
1391 struct ctdb_reply_control *r;
1392 size_t len;
1393 int ret;
1395 /* construct a message to send to the client containing the data */
1396 len = offsetof(struct ctdb_reply_control, data) + data.dsize;
1397 if (errormsg) {
1398 len += strlen(errormsg);
1400 r = ctdbd_allocate_pkt(ctdb, state, CTDB_REPLY_CONTROL, len,
1401 struct ctdb_reply_control);
1402 CTDB_NO_MEMORY_VOID(ctdb, r);
1404 r->hdr.reqid = state->reqid;
1405 r->status = status;
1406 r->datalen = data.dsize;
1407 r->errorlen = 0;
1408 memcpy(&r->data[0], data.dptr, data.dsize);
1409 if (errormsg) {
1410 r->errorlen = strlen(errormsg);
1411 memcpy(&r->data[r->datalen], errormsg, r->errorlen);
1414 ret = daemon_queue_send(client, &r->hdr);
1415 if (ret != -1) {
1416 talloc_free(state);
1421 fail all pending controls to a disconnected node
1423 void ctdb_daemon_cancel_controls(struct ctdb_context *ctdb, struct ctdb_node *node)
1425 struct daemon_control_state *state;
1426 while ((state = node->pending_controls)) {
1427 DLIST_REMOVE(node->pending_controls, state);
1428 daemon_control_callback(ctdb, (uint32_t)-1, tdb_null,
1429 "node is disconnected", state);
1434 destroy a daemon_control_state
1436 static int daemon_control_destructor(struct daemon_control_state *state)
1438 if (state->node) {
1439 DLIST_REMOVE(state->node->pending_controls, state);
1441 return 0;
1445 this is called when the ctdb daemon received a ctdb request control
1446 from a local client over the unix domain socket
1448 static void daemon_request_control_from_client(struct ctdb_client *client,
1449 struct ctdb_req_control *c)
1451 TDB_DATA data;
1452 int res;
1453 struct daemon_control_state *state;
1454 TALLOC_CTX *tmp_ctx = talloc_new(client);
1456 if (c->hdr.destnode == CTDB_CURRENT_NODE) {
1457 c->hdr.destnode = client->ctdb->pnn;
1460 state = talloc(client, struct daemon_control_state);
1461 CTDB_NO_MEMORY_VOID(client->ctdb, state);
1463 state->client = client;
1464 state->c = talloc_steal(state, c);
1465 state->reqid = c->hdr.reqid;
1466 if (ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
1467 state->node = client->ctdb->nodes[c->hdr.destnode];
1468 DLIST_ADD(state->node->pending_controls, state);
1469 } else {
1470 state->node = NULL;
1473 talloc_set_destructor(state, daemon_control_destructor);
1475 if (c->flags & CTDB_CTRL_FLAG_NOREPLY) {
1476 talloc_steal(tmp_ctx, state);
1479 data.dptr = &c->data[0];
1480 data.dsize = c->datalen;
1481 res = ctdb_daemon_send_control(client->ctdb, c->hdr.destnode,
1482 c->srvid, c->opcode, client->client_id,
1483 c->flags,
1484 data, daemon_control_callback,
1485 state);
1486 if (res != 0) {
1487 DEBUG(DEBUG_ERR,(__location__ " Failed to send control to remote node %u\n",
1488 c->hdr.destnode));
1491 talloc_free(tmp_ctx);
1495 register a call function
1497 int ctdb_daemon_set_call(struct ctdb_context *ctdb, uint32_t db_id,
1498 ctdb_fn_t fn, int id)
1500 struct ctdb_registered_call *call;
1501 struct ctdb_db_context *ctdb_db;
1503 ctdb_db = find_ctdb_db(ctdb, db_id);
1504 if (ctdb_db == NULL) {
1505 return -1;
1508 call = talloc(ctdb_db, struct ctdb_registered_call);
1509 call->fn = fn;
1510 call->id = id;
1512 DLIST_ADD(ctdb_db->calls, call);
1513 return 0;
1519 this local messaging handler is ugly, but is needed to prevent
1520 recursion in ctdb_send_message() when the destination node is the
1521 same as the source node
1523 struct ctdb_local_message {
1524 struct ctdb_context *ctdb;
1525 uint64_t srvid;
1526 TDB_DATA data;
1529 static void ctdb_local_message_trigger(struct event_context *ev, struct timed_event *te,
1530 struct timeval t, void *private_data)
1532 struct ctdb_local_message *m = talloc_get_type(private_data,
1533 struct ctdb_local_message);
1534 int res;
1536 res = ctdb_dispatch_message(m->ctdb, m->srvid, m->data);
1537 if (res != 0) {
1538 DEBUG(DEBUG_ERR, (__location__ " Failed to dispatch message for srvid=%llu\n",
1539 (unsigned long long)m->srvid));
1541 talloc_free(m);
1544 static int ctdb_local_message(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data)
1546 struct ctdb_local_message *m;
1547 m = talloc(ctdb, struct ctdb_local_message);
1548 CTDB_NO_MEMORY(ctdb, m);
1550 m->ctdb = ctdb;
1551 m->srvid = srvid;
1552 m->data = data;
1553 m->data.dptr = talloc_memdup(m, m->data.dptr, m->data.dsize);
1554 if (m->data.dptr == NULL) {
1555 talloc_free(m);
1556 return -1;
1559 /* this needs to be done as an event to prevent recursion */
1560 event_add_timed(ctdb->ev, m, timeval_zero(), ctdb_local_message_trigger, m);
1561 return 0;
1565 send a ctdb message
1567 int ctdb_daemon_send_message(struct ctdb_context *ctdb, uint32_t pnn,
1568 uint64_t srvid, TDB_DATA data)
1570 struct ctdb_req_message *r;
1571 int len;
1573 if (ctdb->methods == NULL) {
1574 DEBUG(DEBUG_INFO,(__location__ " Failed to send message. Transport is DOWN\n"));
1575 return -1;
1578 /* see if this is a message to ourselves */
1579 if (pnn == ctdb->pnn) {
1580 return ctdb_local_message(ctdb, srvid, data);
1583 len = offsetof(struct ctdb_req_message, data) + data.dsize;
1584 r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_MESSAGE, len,
1585 struct ctdb_req_message);
1586 CTDB_NO_MEMORY(ctdb, r);
1588 r->hdr.destnode = pnn;
1589 r->srvid = srvid;
1590 r->datalen = data.dsize;
1591 memcpy(&r->data[0], data.dptr, data.dsize);
1593 ctdb_queue_packet(ctdb, &r->hdr);
1595 talloc_free(r);
1596 return 0;
1601 struct ctdb_client_notify_list {
1602 struct ctdb_client_notify_list *next, *prev;
1603 struct ctdb_context *ctdb;
1604 uint64_t srvid;
1605 TDB_DATA data;
1609 static int ctdb_client_notify_destructor(struct ctdb_client_notify_list *nl)
1611 int ret;
1613 DEBUG(DEBUG_ERR,("Sending client notify message for srvid:%llu\n", (unsigned long long)nl->srvid));
1615 ret = ctdb_daemon_send_message(nl->ctdb, CTDB_BROADCAST_CONNECTED, (unsigned long long)nl->srvid, nl->data);
1616 if (ret != 0) {
1617 DEBUG(DEBUG_ERR,("Failed to send client notify message\n"));
1620 return 0;
1623 int32_t ctdb_control_register_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
1625 struct ctdb_client_notify_register *notify = (struct ctdb_client_notify_register *)indata.dptr;
1626 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
1627 struct ctdb_client_notify_list *nl;
1629 DEBUG(DEBUG_INFO,("Register srvid %llu for client %d\n", (unsigned long long)notify->srvid, client_id));
1631 if (indata.dsize < offsetof(struct ctdb_client_notify_register, notify_data)) {
1632 DEBUG(DEBUG_ERR,(__location__ " Too little data in control : %d\n", (int)indata.dsize));
1633 return -1;
1636 if (indata.dsize != (notify->len + offsetof(struct ctdb_client_notify_register, notify_data))) {
1637 DEBUG(DEBUG_ERR,(__location__ " Wrong amount of data in control. Got %d, expected %d\n", (int)indata.dsize, (int)(notify->len + offsetof(struct ctdb_client_notify_register, notify_data))));
1638 return -1;
1642 if (client == NULL) {
1643 DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
1644 return -1;
1647 for(nl=client->notify; nl; nl=nl->next) {
1648 if (nl->srvid == notify->srvid) {
1649 break;
1652 if (nl != NULL) {
1653 DEBUG(DEBUG_ERR,(__location__ " Notification for srvid:%llu already exists for this client\n", (unsigned long long)notify->srvid));
1654 return -1;
1657 nl = talloc(client, struct ctdb_client_notify_list);
1658 CTDB_NO_MEMORY(ctdb, nl);
1659 nl->ctdb = ctdb;
1660 nl->srvid = notify->srvid;
1661 nl->data.dsize = notify->len;
1662 nl->data.dptr = talloc_size(nl, nl->data.dsize);
1663 CTDB_NO_MEMORY(ctdb, nl->data.dptr);
1664 memcpy(nl->data.dptr, notify->notify_data, nl->data.dsize);
1666 DLIST_ADD(client->notify, nl);
1667 talloc_set_destructor(nl, ctdb_client_notify_destructor);
1669 return 0;
1672 int32_t ctdb_control_deregister_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
1674 struct ctdb_client_notify_deregister *notify = (struct ctdb_client_notify_deregister *)indata.dptr;
1675 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
1676 struct ctdb_client_notify_list *nl;
1678 DEBUG(DEBUG_INFO,("Deregister srvid %llu for client %d\n", (unsigned long long)notify->srvid, client_id));
1680 if (client == NULL) {
1681 DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
1682 return -1;
1685 for(nl=client->notify; nl; nl=nl->next) {
1686 if (nl->srvid == notify->srvid) {
1687 break;
1690 if (nl == NULL) {
1691 DEBUG(DEBUG_ERR,(__location__ " No notification for srvid:%llu found for this client\n", (unsigned long long)notify->srvid));
1692 return -1;
1695 DLIST_REMOVE(client->notify, nl);
1696 talloc_set_destructor(nl, NULL);
1697 talloc_free(nl);
1699 return 0;
1702 struct ctdb_client *ctdb_find_client_by_pid(struct ctdb_context *ctdb, pid_t pid)
1704 struct ctdb_client_pid_list *client_pid;
1706 for (client_pid = ctdb->client_pids; client_pid; client_pid=client_pid->next) {
1707 if (client_pid->pid == pid) {
1708 return client_pid->client;
1711 return NULL;
1715 /* This control is used by samba when probing if a process (of a samba daemon)
1716 exists on the node.
1717 Samba does this when it needs/wants to check if a subrecord in one of the
1718 databases is still valied, or if it is stale and can be removed.
1719 If the node is in unhealthy or stopped state we just kill of the samba
1720 process holding htis sub-record and return to the calling samba that
1721 the process does not exist.
1722 This allows us to forcefully recall subrecords registered by samba processes
1723 on banned and stopped nodes.
1725 int32_t ctdb_control_process_exists(struct ctdb_context *ctdb, pid_t pid)
1727 struct ctdb_client *client;
1729 if (ctdb->nodes[ctdb->pnn]->flags & (NODE_FLAGS_BANNED|NODE_FLAGS_STOPPED)) {
1730 client = ctdb_find_client_by_pid(ctdb, pid);
1731 if (client != NULL) {
1732 DEBUG(DEBUG_NOTICE,(__location__ " Killing client with pid:%d on banned/stopped node\n", (int)pid));
1733 talloc_free(client);
1735 return -1;
1738 return kill(pid, 0);
1741 void ctdb_shutdown_sequence(struct ctdb_context *ctdb, int exit_code)
1743 if (ctdb->runstate == CTDB_RUNSTATE_SHUTDOWN) {
1744 DEBUG(DEBUG_NOTICE,("Already shutting down so will not proceed.\n"));
1745 return;
1748 DEBUG(DEBUG_NOTICE,("Shutdown sequence commencing.\n"));
1749 ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SHUTDOWN);
1750 ctdb_stop_recoverd(ctdb);
1751 ctdb_stop_keepalive(ctdb);
1752 ctdb_stop_monitoring(ctdb);
1753 ctdb_release_all_ips(ctdb);
1754 ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
1755 if (ctdb->methods != NULL) {
1756 ctdb->methods->shutdown(ctdb);
1759 DEBUG(DEBUG_NOTICE,("Shutdown sequence complete, exiting.\n"));
1760 exit(exit_code);