/*
   ctdb main protocol code

   Copyright (C) Andrew Tridgell  2006

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
22 #include "lib/util/dlinklist.h"
23 #include "system/network.h"
24 #include "system/filesys.h"
25 #include "../include/ctdb_private.h"
28 choose the transport we will use
30 int ctdb_set_transport(struct ctdb_context
*ctdb
, const char *transport
)
32 ctdb
->transport
= talloc_strdup(ctdb
, transport
);
33 CTDB_NO_MEMORY(ctdb
, ctdb
->transport
);
39 Check whether an ip is a valid node ip
40 Returns the node id for this ip address or -1
42 int ctdb_ip_to_nodeid(struct ctdb_context
*ctdb
, const char *nodeip
)
46 for (nodeid
=0;nodeid
<ctdb
->num_nodes
;nodeid
++) {
47 if (ctdb
->nodes
[nodeid
]->flags
& NODE_FLAGS_DELETED
) {
50 if (!strcmp(ctdb
->nodes
[nodeid
]->address
.address
, nodeip
)) {
59 choose the recovery lock file
61 int ctdb_set_recovery_lock_file(struct ctdb_context
*ctdb
, const char *file
)
63 if (ctdb
->recovery_lock_file
!= NULL
) {
64 talloc_free(ctdb
->recovery_lock_file
);
65 ctdb
->recovery_lock_file
= NULL
;
69 DEBUG(DEBUG_ALERT
,("Recovery lock file set to \"\". Disabling recovery lock checking\n"));
70 ctdb
->tunable
.verify_recovery_lock
= 0;
74 ctdb
->recovery_lock_file
= talloc_strdup(ctdb
, file
);
75 CTDB_NO_MEMORY(ctdb
, ctdb
->recovery_lock_file
);
81 add a node to the list of nodes
83 static int ctdb_add_node(struct ctdb_context
*ctdb
, char *nstr
)
85 struct ctdb_node
*node
, **nodep
;
87 nodep
= talloc_realloc(ctdb
, ctdb
->nodes
, struct ctdb_node
*, ctdb
->num_nodes
+1);
88 CTDB_NO_MEMORY(ctdb
, nodep
);
91 nodep
= &ctdb
->nodes
[ctdb
->num_nodes
];
92 (*nodep
) = talloc_zero(ctdb
->nodes
, struct ctdb_node
);
93 CTDB_NO_MEMORY(ctdb
, *nodep
);
96 if (ctdb_parse_address(ctdb
, node
, nstr
, &node
->address
) != 0) {
100 node
->name
= talloc_asprintf(node
, "%s:%u",
101 node
->address
.address
,
103 /* this assumes that the nodes are kept in sorted order, and no gaps */
104 node
->pnn
= ctdb
->num_nodes
;
106 /* nodes start out disconnected and unhealthy */
107 node
->flags
= (NODE_FLAGS_DISCONNECTED
| NODE_FLAGS_UNHEALTHY
);
109 if (ctdb
->address
.address
&&
110 ctdb_same_address(&ctdb
->address
, &node
->address
)) {
111 /* for automatic binding to interfaces, see tcp_connect.c */
112 ctdb
->pnn
= node
->pnn
;
116 node
->dead_count
= 0;
122 add an entry for a "deleted" node to the list of nodes.
123 a "deleted" node is a node that is commented out from the nodes file.
124 this is used to prevent that subsequent nodes in the nodes list
125 change their pnn value if a node is "delete" by commenting it out and then
126 using "ctdb reloadnodes" at runtime.
128 static int ctdb_add_deleted_node(struct ctdb_context
*ctdb
)
130 struct ctdb_node
*node
, **nodep
;
132 nodep
= talloc_realloc(ctdb
, ctdb
->nodes
, struct ctdb_node
*, ctdb
->num_nodes
+1);
133 CTDB_NO_MEMORY(ctdb
, nodep
);
136 nodep
= &ctdb
->nodes
[ctdb
->num_nodes
];
137 (*nodep
) = talloc_zero(ctdb
->nodes
, struct ctdb_node
);
138 CTDB_NO_MEMORY(ctdb
, *nodep
);
141 if (ctdb_parse_address(ctdb
, node
, "0.0.0.0", &node
->address
) != 0) {
142 DEBUG(DEBUG_ERR
,("Failed to setup deleted node %d\n", ctdb
->num_nodes
));
146 node
->name
= talloc_strdup(node
, "0.0.0.0:0");
148 /* this assumes that the nodes are kept in sorted order, and no gaps */
149 node
->pnn
= ctdb
->num_nodes
;
151 /* this node is permanently deleted/disconnected */
152 node
->flags
= NODE_FLAGS_DELETED
|NODE_FLAGS_DISCONNECTED
;
155 node
->dead_count
= 0;
162 setup the node list from a file
164 static int ctdb_set_nlist(struct ctdb_context
*ctdb
, const char *nlist
)
168 int i
, j
, num_present
;
170 talloc_free(ctdb
->nodes
);
174 lines
= file_lines_load(nlist
, &nlines
, ctdb
);
176 ctdb_set_error(ctdb
, "Failed to load nlist '%s'\n", nlist
);
179 while (nlines
> 0 && strcmp(lines
[nlines
-1], "") == 0) {
184 for (i
=0; i
< nlines
; i
++) {
188 /* strip leading spaces */
189 while((*node
== ' ') || (*node
== '\t')) {
193 if (ctdb_add_deleted_node(ctdb
) != 0) {
199 if (strcmp(node
, "") == 0) {
202 if (ctdb_add_node(ctdb
, node
) != 0) {
209 /* initialize the vnn mapping table now that we have the nodes list,
210 skipping any deleted nodes
212 ctdb
->vnn_map
= talloc(ctdb
, struct ctdb_vnn_map
);
213 CTDB_NO_MEMORY(ctdb
, ctdb
->vnn_map
);
215 ctdb
->vnn_map
->generation
= INVALID_GENERATION
;
216 ctdb
->vnn_map
->size
= num_present
;
217 ctdb
->vnn_map
->map
= talloc_array(ctdb
->vnn_map
, uint32_t, ctdb
->vnn_map
->size
);
218 CTDB_NO_MEMORY(ctdb
, ctdb
->vnn_map
->map
);
220 for(i
=0, j
=0; i
< ctdb
->vnn_map
->size
; i
++) {
221 if (ctdb
->nodes
[i
]->flags
& NODE_FLAGS_DELETED
) {
224 ctdb
->vnn_map
->map
[j
] = i
;
232 void ctdb_load_nodes_file(struct ctdb_context
*ctdb
)
236 ret
= ctdb_set_nlist(ctdb
, ctdb
->nodes_file
);
238 DEBUG(DEBUG_ALERT
,("ctdb_set_nlist failed - %s\n", ctdb_errstr(ctdb
)));
244 setup the local node address
246 int ctdb_set_address(struct ctdb_context
*ctdb
, const char *address
)
248 if (ctdb_parse_address(ctdb
, ctdb
, address
, &ctdb
->address
) != 0) {
252 ctdb
->name
= talloc_asprintf(ctdb
, "%s:%u",
253 ctdb
->address
.address
,
260 return the number of active nodes
262 uint32_t ctdb_get_num_active_nodes(struct ctdb_context
*ctdb
)
266 for (i
=0; i
< ctdb
->num_nodes
; i
++) {
267 if (!(ctdb
->nodes
[i
]->flags
& NODE_FLAGS_INACTIVE
)) {
276 called when we need to process a packet. This can be a requeued packet
277 after a lockwait, or a real packet from another node
279 void ctdb_input_pkt(struct ctdb_context
*ctdb
, struct ctdb_req_header
*hdr
)
283 /* place the packet as a child of the tmp_ctx. We then use
284 talloc_free() below to free it. If any of the calls want
285 to keep it, then they will steal it somewhere else, and the
286 talloc_free() will only free the tmp_ctx */
287 tmp_ctx
= talloc_new(ctdb
);
288 talloc_steal(tmp_ctx
, hdr
);
290 DEBUG(DEBUG_DEBUG
,(__location__
" ctdb request %u of type %u length %u from "
291 "node %u to %u\n", hdr
->reqid
, hdr
->operation
, hdr
->length
,
292 hdr
->srcnode
, hdr
->destnode
));
294 switch (hdr
->operation
) {
296 case CTDB_REPLY_CALL
:
297 case CTDB_REQ_DMASTER
:
298 case CTDB_REPLY_DMASTER
:
299 /* we dont allow these calls when banned */
300 if (ctdb
->nodes
[ctdb
->pnn
]->flags
& NODE_FLAGS_BANNED
) {
301 DEBUG(DEBUG_DEBUG
,(__location__
" ctdb operation %u"
303 " length %u from node %u to %u while node"
305 hdr
->operation
, hdr
->reqid
,
307 hdr
->srcnode
, hdr
->destnode
));
311 /* for ctdb_call inter-node operations verify that the
312 remote node that sent us the call is running in the
313 same generation instance as this node
315 if (ctdb
->vnn_map
->generation
!= hdr
->generation
) {
316 DEBUG(DEBUG_DEBUG
,(__location__
" ctdb operation %u"
318 " length %u from node %u to %u had an"
319 " invalid generation id:%u while our"
320 " generation id is:%u\n",
321 hdr
->operation
, hdr
->reqid
,
323 hdr
->srcnode
, hdr
->destnode
,
324 hdr
->generation
, ctdb
->vnn_map
->generation
));
329 switch (hdr
->operation
) {
331 CTDB_INCREMENT_STAT(ctdb
, node
.req_call
);
332 ctdb_request_call(ctdb
, hdr
);
335 case CTDB_REPLY_CALL
:
336 CTDB_INCREMENT_STAT(ctdb
, node
.reply_call
);
337 ctdb_reply_call(ctdb
, hdr
);
340 case CTDB_REPLY_ERROR
:
341 CTDB_INCREMENT_STAT(ctdb
, node
.reply_error
);
342 ctdb_reply_error(ctdb
, hdr
);
345 case CTDB_REQ_DMASTER
:
346 CTDB_INCREMENT_STAT(ctdb
, node
.req_dmaster
);
347 ctdb_request_dmaster(ctdb
, hdr
);
350 case CTDB_REPLY_DMASTER
:
351 CTDB_INCREMENT_STAT(ctdb
, node
.reply_dmaster
);
352 ctdb_reply_dmaster(ctdb
, hdr
);
355 case CTDB_REQ_MESSAGE
:
356 CTDB_INCREMENT_STAT(ctdb
, node
.req_message
);
357 ctdb_request_message(ctdb
, hdr
);
360 case CTDB_REQ_CONTROL
:
361 CTDB_INCREMENT_STAT(ctdb
, node
.req_control
);
362 ctdb_request_control(ctdb
, hdr
);
365 case CTDB_REPLY_CONTROL
:
366 CTDB_INCREMENT_STAT(ctdb
, node
.reply_control
);
367 ctdb_reply_control(ctdb
, hdr
);
370 case CTDB_REQ_KEEPALIVE
:
371 CTDB_INCREMENT_STAT(ctdb
, keepalive_packets_recv
);
375 DEBUG(DEBUG_CRIT
,("%s: Packet with unknown operation %u\n",
376 __location__
, hdr
->operation
));
381 talloc_free(tmp_ctx
);
386 called by the transport layer when a node is dead
388 void ctdb_node_dead(struct ctdb_node
*node
)
390 if (node
->flags
& NODE_FLAGS_DISCONNECTED
) {
391 DEBUG(DEBUG_INFO
,("%s: node %s is already marked disconnected: %u connected\n",
392 node
->ctdb
->name
, node
->name
,
393 node
->ctdb
->num_connected
));
396 node
->ctdb
->num_connected
--;
397 node
->flags
|= NODE_FLAGS_DISCONNECTED
| NODE_FLAGS_UNHEALTHY
;
399 node
->dead_count
= 0;
401 DEBUG(DEBUG_NOTICE
,("%s: node %s is dead: %u connected\n",
402 node
->ctdb
->name
, node
->name
, node
->ctdb
->num_connected
));
403 ctdb_daemon_cancel_controls(node
->ctdb
, node
);
405 if (node
->ctdb
->methods
== NULL
) {
406 DEBUG(DEBUG_ERR
,(__location__
" Can not restart transport while shutting down daemon.\n"));
410 node
->ctdb
->methods
->restart(node
);
414 called by the transport layer when a node is connected
416 void ctdb_node_connected(struct ctdb_node
*node
)
418 if (!(node
->flags
& NODE_FLAGS_DISCONNECTED
)) {
419 DEBUG(DEBUG_INFO
,("%s: node %s is already marked connected: %u connected\n",
420 node
->ctdb
->name
, node
->name
,
421 node
->ctdb
->num_connected
));
424 node
->ctdb
->num_connected
++;
425 node
->dead_count
= 0;
426 node
->flags
&= ~NODE_FLAGS_DISCONNECTED
;
427 node
->flags
|= NODE_FLAGS_UNHEALTHY
;
429 ("%s: connected to %s - %u connected\n",
430 node
->ctdb
->name
, node
->name
, node
->ctdb
->num_connected
));
/*
  state carried by a deferred packet: the context to dispatch it in and
  the (copied) packet itself
*/
struct queue_next {
	struct ctdb_context *ctdb;
	struct ctdb_req_header *hdr;
};
440 triggered when a deferred packet is due
442 static void queue_next_trigger(struct event_context
*ev
, struct timed_event
*te
,
443 struct timeval t
, void *private_data
)
445 struct queue_next
*q
= talloc_get_type(private_data
, struct queue_next
);
446 ctdb_input_pkt(q
->ctdb
, q
->hdr
);
451 defer a packet, so it is processed on the next event loop
452 this is used for sending packets to ourselves
454 static void ctdb_defer_packet(struct ctdb_context
*ctdb
, struct ctdb_req_header
*hdr
)
456 struct queue_next
*q
;
457 q
= talloc(ctdb
, struct queue_next
);
459 DEBUG(DEBUG_ERR
,(__location__
" Failed to allocate deferred packet\n"));
463 q
->hdr
= talloc_memdup(ctdb
, hdr
, hdr
->length
);
464 if (q
->hdr
== NULL
) {
465 DEBUG(DEBUG_ERR
,("Error copying deferred packet to self\n"));
469 /* use this to put packets directly into our recv function */
470 ctdb_input_pkt(q
->ctdb
, q
->hdr
);
472 event_add_timed(ctdb
->ev
, q
, timeval_zero(), queue_next_trigger
, q
);
478 broadcast a packet to all nodes
480 static void ctdb_broadcast_packet_all(struct ctdb_context
*ctdb
,
481 struct ctdb_req_header
*hdr
)
484 for (i
=0; i
< ctdb
->num_nodes
; i
++) {
485 if (ctdb
->nodes
[i
]->flags
& NODE_FLAGS_DELETED
) {
488 hdr
->destnode
= ctdb
->nodes
[i
]->pnn
;
489 ctdb_queue_packet(ctdb
, hdr
);
494 broadcast a packet to all nodes in the current vnnmap
496 static void ctdb_broadcast_packet_vnnmap(struct ctdb_context
*ctdb
,
497 struct ctdb_req_header
*hdr
)
500 for (i
=0;i
<ctdb
->vnn_map
->size
;i
++) {
501 hdr
->destnode
= ctdb
->vnn_map
->map
[i
];
502 ctdb_queue_packet(ctdb
, hdr
);
507 broadcast a packet to all connected nodes
509 static void ctdb_broadcast_packet_connected(struct ctdb_context
*ctdb
,
510 struct ctdb_req_header
*hdr
)
513 for (i
=0; i
< ctdb
->num_nodes
; i
++) {
514 if (ctdb
->nodes
[i
]->flags
& NODE_FLAGS_DELETED
) {
517 if (!(ctdb
->nodes
[i
]->flags
& NODE_FLAGS_DISCONNECTED
)) {
518 hdr
->destnode
= ctdb
->nodes
[i
]->pnn
;
519 ctdb_queue_packet(ctdb
, hdr
);
525 queue a packet or die
527 void ctdb_queue_packet(struct ctdb_context
*ctdb
, struct ctdb_req_header
*hdr
)
529 struct ctdb_node
*node
;
531 switch (hdr
->destnode
) {
532 case CTDB_BROADCAST_ALL
:
533 ctdb_broadcast_packet_all(ctdb
, hdr
);
535 case CTDB_BROADCAST_VNNMAP
:
536 ctdb_broadcast_packet_vnnmap(ctdb
, hdr
);
538 case CTDB_BROADCAST_CONNECTED
:
539 ctdb_broadcast_packet_connected(ctdb
, hdr
);
543 CTDB_INCREMENT_STAT(ctdb
, node_packets_sent
);
545 if (!ctdb_validate_pnn(ctdb
, hdr
->destnode
)) {
546 DEBUG(DEBUG_CRIT
,(__location__
" cant send to node %u that does not exist\n",
551 node
= ctdb
->nodes
[hdr
->destnode
];
553 if (node
->flags
& NODE_FLAGS_DELETED
) {
554 DEBUG(DEBUG_ERR
, (__location__
" Can not queue packet to DELETED node %d\n", hdr
->destnode
));
558 if (node
->pnn
== ctdb
->pnn
) {
559 ctdb_defer_packet(ctdb
, hdr
);
563 if (ctdb
->methods
== NULL
) {
564 DEBUG(DEBUG_ALERT
, (__location__
" Can not queue packet. "
565 "Transport is DOWN\n"));
570 if (ctdb
->methods
->queue_pkt(node
, (uint8_t *)hdr
, hdr
->length
) != 0) {
571 ctdb_fatal(ctdb
, "Unable to queue packet\n");
/*
  a valgrind hack to allow us to get opcode specific backtraces
  very ugly, and relies on no compiler optimisation!

  Every branch does the same thing - ctdb_queue_packet(ctdb, hdr) - but
  from a different source line, so that the call site in a backtrace
  identifies the opcode.  The cases are therefore kept one per line.

  NOTE(review): the case list was lost from the extracted text; the
  range 1..100 is restored from the size of the gap between the macro
  definition and the default branch - confirm against the upstream
  file.
*/
void ctdb_queue_packet_opcode(struct ctdb_context *ctdb, struct ctdb_req_header *hdr, unsigned opcode)
{
	switch (opcode) {
#define DO_OP(x) case x: ctdb_queue_packet(ctdb, hdr); break
		DO_OP(1);
		DO_OP(2);
		DO_OP(3);
		DO_OP(4);
		DO_OP(5);
		DO_OP(6);
		DO_OP(7);
		DO_OP(8);
		DO_OP(9);
		DO_OP(10);
		DO_OP(11);
		DO_OP(12);
		DO_OP(13);
		DO_OP(14);
		DO_OP(15);
		DO_OP(16);
		DO_OP(17);
		DO_OP(18);
		DO_OP(19);
		DO_OP(20);
		DO_OP(21);
		DO_OP(22);
		DO_OP(23);
		DO_OP(24);
		DO_OP(25);
		DO_OP(26);
		DO_OP(27);
		DO_OP(28);
		DO_OP(29);
		DO_OP(30);
		DO_OP(31);
		DO_OP(32);
		DO_OP(33);
		DO_OP(34);
		DO_OP(35);
		DO_OP(36);
		DO_OP(37);
		DO_OP(38);
		DO_OP(39);
		DO_OP(40);
		DO_OP(41);
		DO_OP(42);
		DO_OP(43);
		DO_OP(44);
		DO_OP(45);
		DO_OP(46);
		DO_OP(47);
		DO_OP(48);
		DO_OP(49);
		DO_OP(50);
		DO_OP(51);
		DO_OP(52);
		DO_OP(53);
		DO_OP(54);
		DO_OP(55);
		DO_OP(56);
		DO_OP(57);
		DO_OP(58);
		DO_OP(59);
		DO_OP(60);
		DO_OP(61);
		DO_OP(62);
		DO_OP(63);
		DO_OP(64);
		DO_OP(65);
		DO_OP(66);
		DO_OP(67);
		DO_OP(68);
		DO_OP(69);
		DO_OP(70);
		DO_OP(71);
		DO_OP(72);
		DO_OP(73);
		DO_OP(74);
		DO_OP(75);
		DO_OP(76);
		DO_OP(77);
		DO_OP(78);
		DO_OP(79);
		DO_OP(80);
		DO_OP(81);
		DO_OP(82);
		DO_OP(83);
		DO_OP(84);
		DO_OP(85);
		DO_OP(86);
		DO_OP(87);
		DO_OP(88);
		DO_OP(89);
		DO_OP(90);
		DO_OP(91);
		DO_OP(92);
		DO_OP(93);
		DO_OP(94);
		DO_OP(95);
		DO_OP(96);
		DO_OP(97);
		DO_OP(98);
		DO_OP(99);
		DO_OP(100);
	default:
		ctdb_queue_packet(ctdb, hdr);
		break;
	}
}