ctdb-daemon: Mark interfaces as "up" by default
[Samba.git] / ctdb / server / ctdb_takeover.c
blobe6b70e13cebfa12e0395163f7828a9b66b262985
1 /*
2 ctdb ip takeover code
4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
33 #define CTDB_ARP_INTERVAL 1
34 #define CTDB_ARP_REPEAT 3
36 /* Flags used in IP allocation algorithms. */
37 struct ctdb_ipflags {
38 bool noiptakeover;
39 bool noiphost;
40 enum ctdb_runstate runstate;
/*
 * One network interface that may carry public addresses.  Entries are
 * kept on the doubly linked list rooted at ctdb->ifaces.
 */
struct ctdb_iface {
	/* list linkage used by DLIST_ADD/DLIST_REMOVE */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name, talloc'ed as a child of this structure */
	const char *name;
	/* interfaces whose link is down are skipped when choosing an iface */
	bool link_up;
	/* number of vnns currently assigned to this interface */
	uint32_t references;
};
50 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
52 if (vnn->iface) {
53 return vnn->iface->name;
56 return "__none__";
59 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
61 struct ctdb_iface *i;
63 /* Verify that we dont have an entry for this ip yet */
64 for (i=ctdb->ifaces;i;i=i->next) {
65 if (strcmp(i->name, iface) == 0) {
66 return 0;
70 /* create a new structure for this interface */
71 i = talloc_zero(ctdb, struct ctdb_iface);
72 CTDB_NO_MEMORY_FATAL(ctdb, i);
73 i->name = talloc_strdup(i, iface);
74 CTDB_NO_MEMORY(ctdb, i->name);
76 i->link_up = true;
78 DLIST_ADD(ctdb->ifaces, i);
80 return 0;
83 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
84 const char *name)
86 int n;
88 for (n = 0; vnn->ifaces[n] != NULL; n++) {
89 if (strcmp(name, vnn->ifaces[n]) == 0) {
90 return true;
94 return false;
97 /* If any interfaces now have no possible IPs then delete them. This
98 * implementation is naive (i.e. simple) rather than clever
99 * (i.e. complex). Given that this is run on delip and that operation
100 * is rare, this doesn't need to be efficient - it needs to be
101 * foolproof. One alternative is reference counting, where the logic
102 * is distributed and can, therefore, be broken in multiple places.
103 * Another alternative is to build a red-black tree of interfaces that
104 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
105 * once) and then walking ctdb->ifaces once and deleting those not in
106 * the tree. Let's go to one of those if the naive implementation
107 * causes problems... :-)
109 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
110 struct ctdb_vnn *vnn)
112 struct ctdb_iface *i, *next;
114 /* For each interface, check if there's an IP using it. */
115 for (i = ctdb->ifaces; i != NULL; i = next) {
116 struct ctdb_vnn *tv;
117 bool found;
118 next = i->next;
120 /* Only consider interfaces named in the given VNN. */
121 if (!vnn_has_interface_with_name(vnn, i->name)) {
122 continue;
125 /* Is the "single IP" on this interface? */
126 if ((ctdb->single_ip_vnn != NULL) &&
127 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
128 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
129 /* Found, next interface please... */
130 continue;
132 /* Search for a vnn with this interface. */
133 found = false;
134 for (tv=ctdb->vnn; tv; tv=tv->next) {
135 if (vnn_has_interface_with_name(tv, i->name)) {
136 found = true;
137 break;
141 if (!found) {
142 /* None of the VNNs are using this interface. */
143 DLIST_REMOVE(ctdb->ifaces, i);
144 talloc_free(i);
150 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
151 const char *iface)
153 struct ctdb_iface *i;
155 for (i=ctdb->ifaces;i;i=i->next) {
156 if (strcmp(i->name, iface) == 0) {
157 return i;
161 return NULL;
164 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
165 struct ctdb_vnn *vnn)
167 int i;
168 struct ctdb_iface *cur = NULL;
169 struct ctdb_iface *best = NULL;
171 for (i=0; vnn->ifaces[i]; i++) {
173 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
174 if (cur == NULL) {
175 continue;
178 if (!cur->link_up) {
179 continue;
182 if (best == NULL) {
183 best = cur;
184 continue;
187 if (cur->references < best->references) {
188 best = cur;
189 continue;
193 return best;
196 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
197 struct ctdb_vnn *vnn)
199 struct ctdb_iface *best = NULL;
201 if (vnn->iface) {
202 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
203 "still assigned to iface '%s'\n",
204 ctdb_addr_to_str(&vnn->public_address),
205 ctdb_vnn_iface_string(vnn)));
206 return 0;
209 best = ctdb_vnn_best_iface(ctdb, vnn);
210 if (best == NULL) {
211 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
212 "cannot assign to iface any iface\n",
213 ctdb_addr_to_str(&vnn->public_address)));
214 return -1;
217 vnn->iface = best;
218 best->references++;
219 vnn->pnn = ctdb->pnn;
221 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
222 "now assigned to iface '%s' refs[%d]\n",
223 ctdb_addr_to_str(&vnn->public_address),
224 ctdb_vnn_iface_string(vnn),
225 best->references));
226 return 0;
229 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
230 struct ctdb_vnn *vnn)
232 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233 "now unassigned (old iface '%s' refs[%d])\n",
234 ctdb_addr_to_str(&vnn->public_address),
235 ctdb_vnn_iface_string(vnn),
236 vnn->iface?vnn->iface->references:0));
237 if (vnn->iface) {
238 vnn->iface->references--;
240 vnn->iface = NULL;
241 if (vnn->pnn == ctdb->pnn) {
242 vnn->pnn = -1;
246 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
247 struct ctdb_vnn *vnn)
249 int i;
251 if (vnn->delete_pending) {
252 return false;
255 if (vnn->iface && vnn->iface->link_up) {
256 return true;
259 for (i=0; vnn->ifaces[i]; i++) {
260 struct ctdb_iface *cur;
262 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
263 if (cur == NULL) {
264 continue;
267 if (cur->link_up) {
268 return true;
272 return false;
275 struct ctdb_takeover_arp {
276 struct ctdb_context *ctdb;
277 uint32_t count;
278 ctdb_sock_addr addr;
279 struct ctdb_tcp_array *tcparray;
280 struct ctdb_vnn *vnn;
285 lists of tcp endpoints
287 struct ctdb_tcp_list {
288 struct ctdb_tcp_list *prev, *next;
289 struct ctdb_tcp_connection connection;
293 list of clients to kill on IP release
295 struct ctdb_client_ip {
296 struct ctdb_client_ip *prev, *next;
297 struct ctdb_context *ctdb;
298 ctdb_sock_addr addr;
299 uint32_t client_id;
304 send a gratuitous arp
306 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te,
307 struct timeval t, void *private_data)
309 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
310 struct ctdb_takeover_arp);
311 int i, ret;
312 struct ctdb_tcp_array *tcparray;
313 const char *iface = ctdb_vnn_iface_string(arp->vnn);
315 ret = ctdb_sys_send_arp(&arp->addr, iface);
316 if (ret != 0) {
317 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
318 iface, strerror(errno)));
321 tcparray = arp->tcparray;
322 if (tcparray) {
323 for (i=0;i<tcparray->num;i++) {
324 struct ctdb_tcp_connection *tcon;
326 tcon = &tcparray->connections[i];
327 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
328 (unsigned)ntohs(tcon->dst_addr.ip.sin_port),
329 ctdb_addr_to_str(&tcon->src_addr),
330 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
331 ret = ctdb_sys_send_tcp(
332 &tcon->src_addr,
333 &tcon->dst_addr,
334 0, 0, 0);
335 if (ret != 0) {
336 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
337 ctdb_addr_to_str(&tcon->src_addr)));
342 arp->count++;
344 if (arp->count == CTDB_ARP_REPEAT) {
345 talloc_free(arp);
346 return;
349 event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx,
350 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
351 ctdb_control_send_arp, arp);
354 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
355 struct ctdb_vnn *vnn)
357 struct ctdb_takeover_arp *arp;
358 struct ctdb_tcp_array *tcparray;
360 if (!vnn->takeover_ctx) {
361 vnn->takeover_ctx = talloc_new(vnn);
362 if (!vnn->takeover_ctx) {
363 return -1;
367 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
368 if (!arp) {
369 return -1;
372 arp->ctdb = ctdb;
373 arp->addr = vnn->public_address;
374 arp->vnn = vnn;
376 tcparray = vnn->tcp_array;
377 if (tcparray) {
378 /* add all of the known tcp connections for this IP to the
379 list of tcp connections to send tickle acks for */
380 arp->tcparray = talloc_steal(arp, tcparray);
382 vnn->tcp_array = NULL;
383 vnn->tcp_update_needed = true;
386 event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
387 timeval_zero(), ctdb_control_send_arp, arp);
389 return 0;
392 struct takeover_callback_state {
393 struct ctdb_req_control *c;
394 ctdb_sock_addr *addr;
395 struct ctdb_vnn *vnn;
/*
 * State carried from ctdb_do_takeip() to ctdb_do_takeip_callback().
 */
struct ctdb_do_takeip_state {
	/* the control request to reply to once the event has finished */
	struct ctdb_req_control *c;
	/* the vnn being taken over */
	struct ctdb_vnn *vnn;
};
404 called when takeip event finishes
406 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
407 void *private_data)
409 struct ctdb_do_takeip_state *state =
410 talloc_get_type(private_data, struct ctdb_do_takeip_state);
411 int32_t ret;
412 TDB_DATA data;
414 if (status != 0) {
415 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
417 if (status == -ETIME) {
418 ctdb_ban_self(ctdb);
420 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
421 ctdb_addr_to_str(&state->vnn->public_address),
422 ctdb_vnn_iface_string(state->vnn)));
423 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
425 node->flags |= NODE_FLAGS_UNHEALTHY;
426 talloc_free(state);
427 return;
430 if (ctdb->do_checkpublicip) {
432 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
433 if (ret != 0) {
434 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
435 talloc_free(state);
436 return;
441 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
442 data.dsize = strlen((char *)data.dptr) + 1;
443 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
445 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
448 /* the control succeeded */
449 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
450 talloc_free(state);
451 return;
454 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
456 state->vnn->update_in_flight = false;
457 return 0;
461 take over an ip address
463 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
464 struct ctdb_req_control *c,
465 struct ctdb_vnn *vnn)
467 int ret;
468 struct ctdb_do_takeip_state *state;
470 if (vnn->update_in_flight) {
471 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
472 "update for this IP already in flight\n",
473 ctdb_addr_to_str(&vnn->public_address),
474 vnn->public_netmask_bits));
475 return -1;
478 ret = ctdb_vnn_assign_iface(ctdb, vnn);
479 if (ret != 0) {
480 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
481 "assign a usable interface\n",
482 ctdb_addr_to_str(&vnn->public_address),
483 vnn->public_netmask_bits));
484 return -1;
487 state = talloc(vnn, struct ctdb_do_takeip_state);
488 CTDB_NO_MEMORY(ctdb, state);
490 state->c = talloc_steal(ctdb, c);
491 state->vnn = vnn;
493 vnn->update_in_flight = true;
494 talloc_set_destructor(state, ctdb_takeip_destructor);
496 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
497 ctdb_addr_to_str(&vnn->public_address),
498 vnn->public_netmask_bits,
499 ctdb_vnn_iface_string(vnn)));
501 ret = ctdb_event_script_callback(ctdb,
502 state,
503 ctdb_do_takeip_callback,
504 state,
505 CTDB_EVENT_TAKE_IP,
506 "%s %s %u",
507 ctdb_vnn_iface_string(vnn),
508 ctdb_addr_to_str(&vnn->public_address),
509 vnn->public_netmask_bits);
511 if (ret != 0) {
512 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
513 ctdb_addr_to_str(&vnn->public_address),
514 ctdb_vnn_iface_string(vnn)));
515 talloc_free(state);
516 return -1;
519 return 0;
/*
 * State carried from ctdb_do_updateip() to ctdb_do_updateip_callback().
 */
struct ctdb_do_updateip_state {
	/* the control request to reply to once the event has finished */
	struct ctdb_req_control *c;
	/* the interface the address is being moved away from */
	struct ctdb_iface *old;
	/* the vnn being moved */
	struct ctdb_vnn *vnn;
};
529 called when updateip event finishes
531 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
532 void *private_data)
534 struct ctdb_do_updateip_state *state =
535 talloc_get_type(private_data, struct ctdb_do_updateip_state);
536 int32_t ret;
538 if (status != 0) {
539 if (status == -ETIME) {
540 ctdb_ban_self(ctdb);
542 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
543 ctdb_addr_to_str(&state->vnn->public_address),
544 state->old->name,
545 ctdb_vnn_iface_string(state->vnn)));
548 * All we can do is reset the old interface
549 * and let the next run fix it
551 ctdb_vnn_unassign_iface(ctdb, state->vnn);
552 state->vnn->iface = state->old;
553 state->vnn->iface->references++;
555 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
556 talloc_free(state);
557 return;
560 if (ctdb->do_checkpublicip) {
562 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
563 if (ret != 0) {
564 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
565 talloc_free(state);
566 return;
571 /* the control succeeded */
572 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
573 talloc_free(state);
574 return;
577 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
579 state->vnn->update_in_flight = false;
580 return 0;
584 update (move) an ip address
586 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
587 struct ctdb_req_control *c,
588 struct ctdb_vnn *vnn)
590 int ret;
591 struct ctdb_do_updateip_state *state;
592 struct ctdb_iface *old = vnn->iface;
593 const char *new_name;
595 if (vnn->update_in_flight) {
596 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
597 "update for this IP already in flight\n",
598 ctdb_addr_to_str(&vnn->public_address),
599 vnn->public_netmask_bits));
600 return -1;
603 ctdb_vnn_unassign_iface(ctdb, vnn);
604 ret = ctdb_vnn_assign_iface(ctdb, vnn);
605 if (ret != 0) {
606 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
607 "assin a usable interface (old iface '%s')\n",
608 ctdb_addr_to_str(&vnn->public_address),
609 vnn->public_netmask_bits,
610 old->name));
611 return -1;
614 new_name = ctdb_vnn_iface_string(vnn);
615 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
616 /* A benign update from one interface onto itself.
617 * no need to run the eventscripts in this case, just return
618 * success.
620 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
621 return 0;
624 state = talloc(vnn, struct ctdb_do_updateip_state);
625 CTDB_NO_MEMORY(ctdb, state);
627 state->c = talloc_steal(ctdb, c);
628 state->old = old;
629 state->vnn = vnn;
631 vnn->update_in_flight = true;
632 talloc_set_destructor(state, ctdb_updateip_destructor);
634 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
635 "interface %s to %s\n",
636 ctdb_addr_to_str(&vnn->public_address),
637 vnn->public_netmask_bits,
638 old->name,
639 new_name));
641 ret = ctdb_event_script_callback(ctdb,
642 state,
643 ctdb_do_updateip_callback,
644 state,
645 CTDB_EVENT_UPDATE_IP,
646 "%s %s %s %u",
647 state->old->name,
648 new_name,
649 ctdb_addr_to_str(&vnn->public_address),
650 vnn->public_netmask_bits);
651 if (ret != 0) {
652 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
653 ctdb_addr_to_str(&vnn->public_address),
654 old->name, new_name));
655 talloc_free(state);
656 return -1;
659 return 0;
663 Find the vnn of the node that has a public ip address
664 returns -1 if the address is not known as a public address
666 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
668 struct ctdb_vnn *vnn;
670 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
671 if (ctdb_same_ip(&vnn->public_address, addr)) {
672 return vnn;
676 return NULL;
680 take over an ip address
682 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
683 struct ctdb_req_control *c,
684 TDB_DATA indata,
685 bool *async_reply)
687 int ret;
688 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
689 struct ctdb_vnn *vnn;
690 bool have_ip = false;
691 bool do_updateip = false;
692 bool do_takeip = false;
693 struct ctdb_iface *best_iface = NULL;
695 if (pip->pnn != ctdb->pnn) {
696 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
697 "with pnn %d, but we're node %d\n",
698 ctdb_addr_to_str(&pip->addr),
699 pip->pnn, ctdb->pnn));
700 return -1;
703 /* update out vnn list */
704 vnn = find_public_ip_vnn(ctdb, &pip->addr);
705 if (vnn == NULL) {
706 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
707 ctdb_addr_to_str(&pip->addr)));
708 return 0;
711 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
712 have_ip = ctdb_sys_have_ip(&pip->addr);
714 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
715 if (best_iface == NULL) {
716 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
717 "a usable interface (old %s, have_ip %d)\n",
718 ctdb_addr_to_str(&vnn->public_address),
719 vnn->public_netmask_bits,
720 ctdb_vnn_iface_string(vnn),
721 have_ip));
722 return -1;
725 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
726 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
727 have_ip = false;
731 if (vnn->iface == NULL && have_ip) {
732 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
733 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
734 ctdb_addr_to_str(&vnn->public_address)));
735 return 0;
738 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
739 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
740 "and we have it on iface[%s], but it was assigned to node %d"
741 "and we are node %d, banning ourself\n",
742 ctdb_addr_to_str(&vnn->public_address),
743 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
744 ctdb_ban_self(ctdb);
745 return -1;
748 if (vnn->pnn == -1 && have_ip) {
749 vnn->pnn = ctdb->pnn;
750 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
751 "and we already have it on iface[%s], update local daemon\n",
752 ctdb_addr_to_str(&vnn->public_address),
753 ctdb_vnn_iface_string(vnn)));
754 return 0;
757 if (vnn->iface) {
758 if (vnn->iface != best_iface) {
759 if (!vnn->iface->link_up) {
760 do_updateip = true;
761 } else if (vnn->iface->references > (best_iface->references + 1)) {
762 /* only move when the rebalance gains something */
763 do_updateip = true;
768 if (!have_ip) {
769 if (do_updateip) {
770 ctdb_vnn_unassign_iface(ctdb, vnn);
771 do_updateip = false;
773 do_takeip = true;
776 if (do_takeip) {
777 ret = ctdb_do_takeip(ctdb, c, vnn);
778 if (ret != 0) {
779 return -1;
781 } else if (do_updateip) {
782 ret = ctdb_do_updateip(ctdb, c, vnn);
783 if (ret != 0) {
784 return -1;
786 } else {
788 * The interface is up and the kernel known the ip
789 * => do nothing
791 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
792 ctdb_addr_to_str(&pip->addr),
793 vnn->public_netmask_bits,
794 ctdb_vnn_iface_string(vnn)));
795 return 0;
798 /* tell ctdb_control.c that we will be replying asynchronously */
799 *async_reply = true;
801 return 0;
805 kill any clients that are registered with a IP that is being released
807 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
809 struct ctdb_client_ip *ip;
811 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
812 ctdb_addr_to_str(addr)));
814 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
815 ctdb_sock_addr tmp_addr;
817 tmp_addr = ip->addr;
818 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
819 ip->client_id,
820 ctdb_addr_to_str(&ip->addr)));
822 if (ctdb_same_ip(&tmp_addr, addr)) {
823 struct ctdb_client *client = ctdb_reqid_find(ctdb,
824 ip->client_id,
825 struct ctdb_client);
826 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
827 ip->client_id,
828 ctdb_addr_to_str(&ip->addr),
829 client->pid));
831 if (client->pid != 0) {
832 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
833 (unsigned)client->pid,
834 ctdb_addr_to_str(addr),
835 ip->client_id));
836 kill(client->pid, SIGKILL);
842 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
844 DLIST_REMOVE(ctdb->vnn, vnn);
845 ctdb_vnn_unassign_iface(ctdb, vnn);
846 ctdb_remove_orphaned_ifaces(ctdb, vnn);
847 talloc_free(vnn);
851 called when releaseip event finishes
853 static void release_ip_callback(struct ctdb_context *ctdb, int status,
854 void *private_data)
856 struct takeover_callback_state *state =
857 talloc_get_type(private_data, struct takeover_callback_state);
858 TDB_DATA data;
860 if (status == -ETIME) {
861 ctdb_ban_self(ctdb);
864 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
865 if (ctdb_sys_have_ip(state->addr)) {
866 DEBUG(DEBUG_ERR,
867 ("IP %s still hosted during release IP callback, failing\n",
868 ctdb_addr_to_str(state->addr)));
869 ctdb_request_control_reply(ctdb, state->c,
870 NULL, -1, NULL);
871 talloc_free(state);
872 return;
876 /* send a message to all clients of this node telling them
877 that the cluster has been reconfigured and they should
878 release any sockets on this IP */
879 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
880 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
881 data.dsize = strlen((char *)data.dptr)+1;
883 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
885 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
887 /* kill clients that have registered with this IP */
888 release_kill_clients(ctdb, state->addr);
890 ctdb_vnn_unassign_iface(ctdb, state->vnn);
892 /* Process the IP if it has been marked for deletion */
893 if (state->vnn->delete_pending) {
894 do_delete_ip(ctdb, state->vnn);
895 state->vnn = NULL;
898 /* the control succeeded */
899 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
900 talloc_free(state);
903 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
905 if (state->vnn != NULL) {
906 state->vnn->update_in_flight = false;
908 return 0;
912 release an ip address
914 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
915 struct ctdb_req_control *c,
916 TDB_DATA indata,
917 bool *async_reply)
919 int ret;
920 struct takeover_callback_state *state;
921 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
922 struct ctdb_vnn *vnn;
923 char *iface;
925 /* update our vnn list */
926 vnn = find_public_ip_vnn(ctdb, &pip->addr);
927 if (vnn == NULL) {
928 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
929 ctdb_addr_to_str(&pip->addr)));
930 return 0;
932 vnn->pnn = pip->pnn;
934 /* stop any previous arps */
935 talloc_free(vnn->takeover_ctx);
936 vnn->takeover_ctx = NULL;
938 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
939 * lazy multicast to drop an IP from any node that isn't the
940 * intended new node. The following causes makes ctdbd ignore
941 * a release for any address it doesn't host.
943 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
944 if (!ctdb_sys_have_ip(&pip->addr)) {
945 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
946 ctdb_addr_to_str(&pip->addr),
947 vnn->public_netmask_bits,
948 ctdb_vnn_iface_string(vnn)));
949 ctdb_vnn_unassign_iface(ctdb, vnn);
950 return 0;
952 } else {
953 if (vnn->iface == NULL) {
954 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
955 ctdb_addr_to_str(&pip->addr),
956 vnn->public_netmask_bits));
957 return 0;
961 /* There is a potential race between take_ip and us because we
962 * update the VNN via a callback that run when the
963 * eventscripts have been run. Avoid the race by allowing one
964 * update to be in flight at a time.
966 if (vnn->update_in_flight) {
967 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
968 "update for this IP already in flight\n",
969 ctdb_addr_to_str(&vnn->public_address),
970 vnn->public_netmask_bits));
971 return -1;
974 iface = strdup(ctdb_vnn_iface_string(vnn));
976 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
977 ctdb_addr_to_str(&pip->addr),
978 vnn->public_netmask_bits,
979 iface,
980 pip->pnn));
982 state = talloc(ctdb, struct takeover_callback_state);
983 if (state == NULL) {
984 ctdb_set_error(ctdb, "Out of memory at %s:%d",
985 __FILE__, __LINE__);
986 free(iface);
987 return -1;
990 state->c = talloc_steal(state, c);
991 state->addr = talloc(state, ctdb_sock_addr);
992 if (state->addr == NULL) {
993 ctdb_set_error(ctdb, "Out of memory at %s:%d",
994 __FILE__, __LINE__);
995 free(iface);
996 talloc_free(state);
997 return -1;
999 *state->addr = pip->addr;
1000 state->vnn = vnn;
1002 vnn->update_in_flight = true;
1003 talloc_set_destructor(state, ctdb_releaseip_destructor);
1005 ret = ctdb_event_script_callback(ctdb,
1006 state, release_ip_callback, state,
1007 CTDB_EVENT_RELEASE_IP,
1008 "%s %s %u",
1009 iface,
1010 ctdb_addr_to_str(&pip->addr),
1011 vnn->public_netmask_bits);
1012 free(iface);
1013 if (ret != 0) {
1014 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1015 ctdb_addr_to_str(&pip->addr),
1016 ctdb_vnn_iface_string(vnn)));
1017 talloc_free(state);
1018 return -1;
1021 /* tell the control that we will be reply asynchronously */
1022 *async_reply = true;
1023 return 0;
1026 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1027 ctdb_sock_addr *addr,
1028 unsigned mask, const char *ifaces,
1029 bool check_address)
1031 struct ctdb_vnn *vnn;
1032 uint32_t num = 0;
1033 char *tmp;
1034 const char *iface;
1035 int i;
1036 int ret;
1038 tmp = strdup(ifaces);
1039 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1040 if (!ctdb_sys_check_iface_exists(iface)) {
1041 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1042 free(tmp);
1043 return -1;
1046 free(tmp);
1048 /* Verify that we dont have an entry for this ip yet */
1049 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1050 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1051 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1052 ctdb_addr_to_str(addr)));
1053 return -1;
1057 /* create a new vnn structure for this ip address */
1058 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1059 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1060 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1061 tmp = talloc_strdup(vnn, ifaces);
1062 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1063 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1064 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1065 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1066 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1067 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1068 num++;
1070 talloc_free(tmp);
1071 vnn->ifaces[num] = NULL;
1072 vnn->public_address = *addr;
1073 vnn->public_netmask_bits = mask;
1074 vnn->pnn = -1;
1075 if (check_address) {
1076 if (ctdb_sys_have_ip(addr)) {
1077 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1078 vnn->pnn = ctdb->pnn;
1082 for (i=0; vnn->ifaces[i]; i++) {
1083 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1084 if (ret != 0) {
1085 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1086 "for public_address[%s]\n",
1087 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1088 talloc_free(vnn);
1089 return -1;
1093 DLIST_ADD(ctdb->vnn, vnn);
1095 return 0;
1099 setup the public address lists from a file
1101 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1103 char **lines;
1104 int nlines;
1105 int i;
1107 lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1108 if (lines == NULL) {
1109 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1110 return -1;
1112 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1113 nlines--;
1116 for (i=0;i<nlines;i++) {
1117 unsigned mask;
1118 ctdb_sock_addr addr;
1119 const char *addrstr;
1120 const char *ifaces;
1121 char *tok, *line;
1123 line = lines[i];
1124 while ((*line == ' ') || (*line == '\t')) {
1125 line++;
1127 if (*line == '#') {
1128 continue;
1130 if (strcmp(line, "") == 0) {
1131 continue;
1133 tok = strtok(line, " \t");
1134 addrstr = tok;
1135 tok = strtok(NULL, " \t");
1136 if (tok == NULL) {
1137 if (NULL == ctdb->default_public_interface) {
1138 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1139 i+1));
1140 talloc_free(lines);
1141 return -1;
1143 ifaces = ctdb->default_public_interface;
1144 } else {
1145 ifaces = tok;
1148 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1149 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1150 talloc_free(lines);
1151 return -1;
1153 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1154 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1155 talloc_free(lines);
1156 return -1;
1161 talloc_free(lines);
1162 return 0;
1165 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1166 const char *iface,
1167 const char *ip)
1169 struct ctdb_vnn *svnn;
1170 struct ctdb_iface *cur = NULL;
1171 bool ok;
1172 int ret;
1174 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1175 CTDB_NO_MEMORY(ctdb, svnn);
1177 svnn->ifaces = talloc_array(svnn, const char *, 2);
1178 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1179 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1180 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1181 svnn->ifaces[1] = NULL;
1183 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1184 if (!ok) {
1185 talloc_free(svnn);
1186 return -1;
1189 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1190 if (ret != 0) {
1191 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1192 "for single_ip[%s]\n",
1193 svnn->ifaces[0],
1194 ctdb_addr_to_str(&svnn->public_address)));
1195 talloc_free(svnn);
1196 return -1;
1199 /* assume the single public ip interface is initially "good" */
1200 cur = ctdb_find_iface(ctdb, iface);
1201 if (cur == NULL) {
1202 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1203 return -1;
1205 cur->link_up = true;
1207 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1208 if (ret != 0) {
1209 talloc_free(svnn);
1210 return -1;
1213 ctdb->single_ip_vnn = svnn;
1214 return 0;
1217 struct ctdb_public_ip_list {
1218 struct ctdb_public_ip_list *next;
1219 uint32_t pnn;
1220 ctdb_sock_addr addr;
1223 /* Given a physical node, return the number of
1224 public addresses that is currently assigned to this node.
1226 static int node_ip_coverage(struct ctdb_context *ctdb,
1227 int32_t pnn,
1228 struct ctdb_public_ip_list *ips)
1230 int num=0;
1232 for (;ips;ips=ips->next) {
1233 if (ips->pnn == pnn) {
1234 num++;
1237 return num;
1241 /* Can the given node host the given IP: is the public IP known to the
1242 * node and is NOIPHOST unset?
1244 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1245 struct ctdb_ipflags ipflags,
1246 struct ctdb_public_ip_list *ip)
1248 struct ctdb_all_public_ips *public_ips;
1249 int i;
1251 if (ipflags.noiphost) {
1252 return false;
1255 public_ips = ctdb->nodes[pnn]->available_public_ips;
1257 if (public_ips == NULL) {
1258 return false;
1261 for (i=0; i<public_ips->num; i++) {
1262 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1263 /* yes, this node can serve this public ip */
1264 return true;
1268 return false;
1271 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1272 struct ctdb_ipflags ipflags,
1273 struct ctdb_public_ip_list *ip)
1275 if (ipflags.noiptakeover) {
1276 return false;
1279 return can_node_host_ip(ctdb, pnn, ipflags, ip);
1282 /* search the node lists list for a node to takeover this ip.
1283 pick the node that currently are serving the least number of ips
1284 so that the ips get spread out evenly.
1286 static int find_takeover_node(struct ctdb_context *ctdb,
1287 struct ctdb_ipflags *ipflags,
1288 struct ctdb_public_ip_list *ip,
1289 struct ctdb_public_ip_list *all_ips)
1291 int pnn, min=0, num;
1292 int i, numnodes;
1294 numnodes = talloc_array_length(ipflags);
1295 pnn = -1;
1296 for (i=0; i<numnodes; i++) {
1297 /* verify that this node can serve this ip */
1298 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1299 /* no it couldnt so skip to the next node */
1300 continue;
1303 num = node_ip_coverage(ctdb, i, all_ips);
1304 /* was this the first node we checked ? */
1305 if (pnn == -1) {
1306 pnn = i;
1307 min = num;
1308 } else {
1309 if (num < min) {
1310 pnn = i;
1311 min = num;
1315 if (pnn == -1) {
1316 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1317 ctdb_addr_to_str(&ip->addr)));
1319 return -1;
1322 ip->pnn = pnn;
1323 return 0;
1326 #define IP_KEYLEN 4
1327 static uint32_t *ip_key(ctdb_sock_addr *ip)
1329 static uint32_t key[IP_KEYLEN];
1331 bzero(key, sizeof(key));
1333 switch (ip->sa.sa_family) {
1334 case AF_INET:
1335 key[3] = htonl(ip->ip.sin_addr.s_addr);
1336 break;
1337 case AF_INET6: {
1338 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1339 key[0] = htonl(s6_a32[0]);
1340 key[1] = htonl(s6_a32[1]);
1341 key[2] = htonl(s6_a32[2]);
1342 key[3] = htonl(s6_a32[3]);
1343 break;
1345 default:
1346 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1347 return key;
1350 return key;
1353 static void *add_ip_callback(void *parm, void *data)
1355 struct ctdb_public_ip_list *this_ip = parm;
1356 struct ctdb_public_ip_list *prev_ip = data;
1358 if (prev_ip == NULL) {
1359 return parm;
1361 if (this_ip->pnn == -1) {
1362 this_ip->pnn = prev_ip->pnn;
1365 return parm;
1368 static int getips_count_callback(void *param, void *data)
1370 struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1371 struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1373 new_ip->next = *ip_list;
1374 *ip_list = new_ip;
1375 return 0;
1378 static struct ctdb_public_ip_list *
1379 create_merged_ip_list(struct ctdb_context *ctdb)
1381 int i, j;
1382 struct ctdb_public_ip_list *ip_list;
1383 struct ctdb_all_public_ips *public_ips;
1385 if (ctdb->ip_tree != NULL) {
1386 talloc_free(ctdb->ip_tree);
1387 ctdb->ip_tree = NULL;
1389 ctdb->ip_tree = trbt_create(ctdb, 0);
1391 for (i=0;i<ctdb->num_nodes;i++) {
1392 public_ips = ctdb->nodes[i]->known_public_ips;
1394 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1395 continue;
1398 /* there were no public ips for this node */
1399 if (public_ips == NULL) {
1400 continue;
1403 for (j=0;j<public_ips->num;j++) {
1404 struct ctdb_public_ip_list *tmp_ip;
1406 tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1407 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1408 /* Do not use information about IP addresses hosted
1409 * on other nodes, it may not be accurate */
1410 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1411 tmp_ip->pnn = public_ips->ips[j].pnn;
1412 } else {
1413 tmp_ip->pnn = -1;
1415 tmp_ip->addr = public_ips->ips[j].addr;
1416 tmp_ip->next = NULL;
1418 trbt_insertarray32_callback(ctdb->ip_tree,
1419 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1420 add_ip_callback,
1421 tmp_ip);
1425 ip_list = NULL;
1426 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1428 return ip_list;
1432 * This is the length of the longtest common prefix between the IPs.
1433 * It is calculated by XOR-ing the 2 IPs together and counting the
1434 * number of leading zeroes. The implementation means that all
1435 * addresses end up being 128 bits long.
1437 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1438 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1439 * lots of nodes and IP addresses?
1441 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1443 uint32_t ip1_k[IP_KEYLEN];
1444 uint32_t *t;
1445 int i;
1446 uint32_t x;
1448 uint32_t distance = 0;
1450 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1451 t = ip_key(ip2);
1452 for (i=0; i<IP_KEYLEN; i++) {
1453 x = ip1_k[i] ^ t[i];
1454 if (x == 0) {
1455 distance += 32;
1456 } else {
1457 /* Count number of leading zeroes.
1458 * FIXME? This could be optimised...
1460 while ((x & (1 << 31)) == 0) {
1461 x <<= 1;
1462 distance += 1;
1467 return distance;
1470 /* Calculate the IP distance for the given IP relative to IPs on the
1471 given node. The ips argument is generally the all_ips variable
1472 used in the main part of the algorithm.
1474 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1475 struct ctdb_public_ip_list *ips,
1476 int pnn)
1478 struct ctdb_public_ip_list *t;
1479 uint32_t d;
1481 uint32_t sum = 0;
1483 for (t=ips; t != NULL; t=t->next) {
1484 if (t->pnn != pnn) {
1485 continue;
1488 /* Optimisation: We never calculate the distance
1489 * between an address and itself. This allows us to
1490 * calculate the effect of removing an address from a
1491 * node by simply calculating the distance between
1492 * that address and all of the exitsing addresses.
1493 * Moreover, we assume that we're only ever dealing
1494 * with addresses from all_ips so we can identify an
1495 * address via a pointer rather than doing a more
1496 * expensive address comparison. */
1497 if (&(t->addr) == ip) {
1498 continue;
1501 d = ip_distance(ip, &(t->addr));
1502 sum += d * d; /* Cheaper than pulling in math.h :-) */
1505 return sum;
1508 /* Return the LCP2 imbalance metric for addresses currently assigned
1509 to the given node.
1511 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1513 struct ctdb_public_ip_list *t;
1515 uint32_t imbalance = 0;
1517 for (t=all_ips; t!=NULL; t=t->next) {
1518 if (t->pnn != pnn) {
1519 continue;
1521 /* Pass the rest of the IPs rather than the whole
1522 all_ips input list.
1524 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1527 return imbalance;
1530 /* Allocate any unassigned IPs just by looping through the IPs and
1531 * finding the best node for each.
1533 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1534 struct ctdb_ipflags *ipflags,
1535 struct ctdb_public_ip_list *all_ips)
1537 struct ctdb_public_ip_list *tmp_ip;
1539 /* loop over all ip's and find a physical node to cover for
1540 each unassigned ip.
1542 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1543 if (tmp_ip->pnn == -1) {
1544 if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1545 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1546 ctdb_addr_to_str(&tmp_ip->addr)));
1552 /* Basic non-deterministic rebalancing algorithm.
1554 static void basic_failback(struct ctdb_context *ctdb,
1555 struct ctdb_ipflags *ipflags,
1556 struct ctdb_public_ip_list *all_ips,
1557 int num_ips)
1559 int i, numnodes;
1560 int maxnode, maxnum, minnode, minnum, num, retries;
1561 struct ctdb_public_ip_list *tmp_ip;
1563 numnodes = talloc_array_length(ipflags);
1564 retries = 0;
1566 try_again:
1567 maxnum=0;
1568 minnum=0;
1570 /* for each ip address, loop over all nodes that can serve
1571 this ip and make sure that the difference between the node
1572 serving the most and the node serving the least ip's are
1573 not greater than 1.
1575 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1576 if (tmp_ip->pnn == -1) {
1577 continue;
1580 /* Get the highest and lowest number of ips's served by any
1581 valid node which can serve this ip.
1583 maxnode = -1;
1584 minnode = -1;
1585 for (i=0; i<numnodes; i++) {
1586 /* only check nodes that can actually serve this ip */
1587 if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1588 /* no it couldnt so skip to the next node */
1589 continue;
1592 num = node_ip_coverage(ctdb, i, all_ips);
1593 if (maxnode == -1) {
1594 maxnode = i;
1595 maxnum = num;
1596 } else {
1597 if (num > maxnum) {
1598 maxnode = i;
1599 maxnum = num;
1602 if (minnode == -1) {
1603 minnode = i;
1604 minnum = num;
1605 } else {
1606 if (num < minnum) {
1607 minnode = i;
1608 minnum = num;
1612 if (maxnode == -1) {
1613 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1614 ctdb_addr_to_str(&tmp_ip->addr)));
1616 continue;
1619 /* if the spread between the smallest and largest coverage by
1620 a node is >=2 we steal one of the ips from the node with
1621 most coverage to even things out a bit.
1622 try to do this a limited number of times since we dont
1623 want to spend too much time balancing the ip coverage.
1625 if ( (maxnum > minnum+1)
1626 && (retries < (num_ips + 5)) ){
1627 struct ctdb_public_ip_list *tmp;
1629 /* Reassign one of maxnode's VNNs */
1630 for (tmp=all_ips;tmp;tmp=tmp->next) {
1631 if (tmp->pnn == maxnode) {
1632 (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1633 retries++;
1634 goto try_again;;
1641 static void lcp2_init(struct ctdb_context *tmp_ctx,
1642 struct ctdb_ipflags *ipflags,
1643 struct ctdb_public_ip_list *all_ips,
1644 uint32_t *force_rebalance_nodes,
1645 uint32_t **lcp2_imbalances,
1646 bool **rebalance_candidates)
1648 int i, numnodes;
1649 struct ctdb_public_ip_list *tmp_ip;
1651 numnodes = talloc_array_length(ipflags);
1653 *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1654 CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1655 *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1656 CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1658 for (i=0; i<numnodes; i++) {
1659 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1660 /* First step: assume all nodes are candidates */
1661 (*rebalance_candidates)[i] = true;
1664 /* 2nd step: if a node has IPs assigned then it must have been
1665 * healthy before, so we remove it from consideration. This
1666 * is overkill but is all we have because we don't maintain
1667 * state between takeover runs. An alternative would be to
1668 * keep state and invalidate it every time the recovery master
1669 * changes.
1671 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1672 if (tmp_ip->pnn != -1) {
1673 (*rebalance_candidates)[tmp_ip->pnn] = false;
1677 /* 3rd step: if a node is forced to re-balance then
1678 we allow failback onto the node */
1679 if (force_rebalance_nodes == NULL) {
1680 return;
1682 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1683 uint32_t pnn = force_rebalance_nodes[i];
1684 if (pnn >= numnodes) {
1685 DEBUG(DEBUG_ERR,
1686 (__location__ "unknown node %u\n", pnn));
1687 continue;
1690 DEBUG(DEBUG_NOTICE,
1691 ("Forcing rebalancing of IPs to node %u\n", pnn));
1692 (*rebalance_candidates)[pnn] = true;
1696 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1697 * the IP/node combination that will cost the least.
1699 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1700 struct ctdb_ipflags *ipflags,
1701 struct ctdb_public_ip_list *all_ips,
1702 uint32_t *lcp2_imbalances)
1704 struct ctdb_public_ip_list *tmp_ip;
1705 int dstnode, numnodes;
1707 int minnode;
1708 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1709 struct ctdb_public_ip_list *minip;
1711 bool should_loop = true;
1712 bool have_unassigned = true;
1714 numnodes = talloc_array_length(ipflags);
1716 while (have_unassigned && should_loop) {
1717 should_loop = false;
1719 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1720 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1722 minnode = -1;
1723 mindsum = 0;
1724 minip = NULL;
1726 /* loop over each unassigned ip. */
1727 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1728 if (tmp_ip->pnn != -1) {
1729 continue;
1732 for (dstnode=0; dstnode<numnodes; dstnode++) {
1733 /* only check nodes that can actually takeover this ip */
1734 if (!can_node_takeover_ip(ctdb, dstnode,
1735 ipflags[dstnode],
1736 tmp_ip)) {
1737 /* no it couldnt so skip to the next node */
1738 continue;
1741 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1742 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1743 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1744 ctdb_addr_to_str(&(tmp_ip->addr)),
1745 dstnode,
1746 dstimbl - lcp2_imbalances[dstnode]));
1749 if ((minnode == -1) || (dstdsum < mindsum)) {
1750 minnode = dstnode;
1751 minimbl = dstimbl;
1752 mindsum = dstdsum;
1753 minip = tmp_ip;
1754 should_loop = true;
1759 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1761 /* If we found one then assign it to the given node. */
1762 if (minnode != -1) {
1763 minip->pnn = minnode;
1764 lcp2_imbalances[minnode] = minimbl;
1765 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1766 ctdb_addr_to_str(&(minip->addr)),
1767 minnode,
1768 mindsum));
1771 /* There might be a better way but at least this is clear. */
1772 have_unassigned = false;
1773 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1774 if (tmp_ip->pnn == -1) {
1775 have_unassigned = true;
1780 /* We know if we have an unassigned addresses so we might as
1781 * well optimise.
1783 if (have_unassigned) {
1784 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1785 if (tmp_ip->pnn == -1) {
1786 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1787 ctdb_addr_to_str(&tmp_ip->addr)));
1793 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1794 * to move IPs from, determines the best IP/destination node
1795 * combination to move from the source node.
1797 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1798 struct ctdb_ipflags *ipflags,
1799 struct ctdb_public_ip_list *all_ips,
1800 int srcnode,
1801 uint32_t *lcp2_imbalances,
1802 bool *rebalance_candidates)
1804 int dstnode, mindstnode, numnodes;
1805 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1806 uint32_t minsrcimbl, mindstimbl;
1807 struct ctdb_public_ip_list *minip;
1808 struct ctdb_public_ip_list *tmp_ip;
1810 /* Find an IP and destination node that best reduces imbalance. */
1811 srcimbl = 0;
1812 minip = NULL;
1813 minsrcimbl = 0;
1814 mindstnode = -1;
1815 mindstimbl = 0;
1817 numnodes = talloc_array_length(ipflags);
1819 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1820 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1821 srcnode, lcp2_imbalances[srcnode]));
1823 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1824 /* Only consider addresses on srcnode. */
1825 if (tmp_ip->pnn != srcnode) {
1826 continue;
1829 /* What is this IP address costing the source node? */
1830 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1831 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1833 /* Consider this IP address would cost each potential
1834 * destination node. Destination nodes are limited to
1835 * those that are newly healthy, since we don't want
1836 * to do gratuitous failover of IPs just to make minor
1837 * balance improvements.
1839 for (dstnode=0; dstnode<numnodes; dstnode++) {
1840 if (!rebalance_candidates[dstnode]) {
1841 continue;
1844 /* only check nodes that can actually takeover this ip */
1845 if (!can_node_takeover_ip(ctdb, dstnode,
1846 ipflags[dstnode], tmp_ip)) {
1847 /* no it couldnt so skip to the next node */
1848 continue;
1851 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1852 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1853 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1854 srcnode, -srcdsum,
1855 ctdb_addr_to_str(&(tmp_ip->addr)),
1856 dstnode, dstdsum));
1858 if ((dstimbl < lcp2_imbalances[srcnode]) &&
1859 (dstdsum < srcdsum) && \
1860 ((mindstnode == -1) || \
1861 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1863 minip = tmp_ip;
1864 minsrcimbl = srcimbl;
1865 mindstnode = dstnode;
1866 mindstimbl = dstimbl;
1870 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1872 if (mindstnode != -1) {
1873 /* We found a move that makes things better... */
1874 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1875 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1876 ctdb_addr_to_str(&(minip->addr)),
1877 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1880 lcp2_imbalances[srcnode] = minsrcimbl;
1881 lcp2_imbalances[mindstnode] = mindstimbl;
1882 minip->pnn = mindstnode;
1884 return true;
1887 return false;
/* Pairs a node number with its LCP2 imbalance so the pairs can be sorted. */
struct lcp2_imbalance_pnn {
	/* LCP2 imbalance value of the node */
	uint32_t imbalance;
	/* Node number the imbalance belongs to */
	int pnn;
};
1896 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
1898 const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
1899 const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
1901 if (lipa->imbalance > lipb->imbalance) {
1902 return -1;
1903 } else if (lipa->imbalance == lipb->imbalance) {
1904 return 0;
1905 } else {
1906 return 1;
1910 /* LCP2 algorithm for rebalancing the cluster. This finds the source
1911 * node with the highest LCP2 imbalance, and then determines the best
1912 * IP/destination node combination to move from the source node.
1914 static void lcp2_failback(struct ctdb_context *ctdb,
1915 struct ctdb_ipflags *ipflags,
1916 struct ctdb_public_ip_list *all_ips,
1917 uint32_t *lcp2_imbalances,
1918 bool *rebalance_candidates)
1920 int i, numnodes;
1921 struct lcp2_imbalance_pnn * lips;
1922 bool again;
1924 numnodes = talloc_array_length(ipflags);
1926 try_again:
1927 /* Put the imbalances and nodes into an array, sort them and
1928 * iterate through candidates. Usually the 1st one will be
1929 * used, so this doesn't cost much...
1931 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
1932 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
1933 lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
1934 for (i=0; i<numnodes; i++) {
1935 lips[i].imbalance = lcp2_imbalances[i];
1936 lips[i].pnn = i;
1937 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
1939 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
1940 lcp2_cmp_imbalance_pnn);
1942 again = false;
1943 for (i=0; i<numnodes; i++) {
1944 /* This means that all nodes had 0 or 1 addresses, so
1945 * can't be imbalanced.
1947 if (lips[i].imbalance == 0) {
1948 break;
1951 if (lcp2_failback_candidate(ctdb,
1952 ipflags,
1953 all_ips,
1954 lips[i].pnn,
1955 lcp2_imbalances,
1956 rebalance_candidates)) {
1957 again = true;
1958 break;
1962 talloc_free(lips);
1963 if (again) {
1964 goto try_again;
1968 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
1969 struct ctdb_ipflags *ipflags,
1970 struct ctdb_public_ip_list *all_ips)
1972 struct ctdb_public_ip_list *tmp_ip;
1974 /* verify that the assigned nodes can serve that public ip
1975 and set it to -1 if not
1977 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1978 if (tmp_ip->pnn == -1) {
1979 continue;
1981 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
1982 ipflags[tmp_ip->pnn], tmp_ip) != 0) {
1983 /* this node can not serve this ip. */
1984 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
1985 ctdb_addr_to_str(&(tmp_ip->addr)),
1986 tmp_ip->pnn));
1987 tmp_ip->pnn = -1;
1992 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
1993 struct ctdb_ipflags *ipflags,
1994 struct ctdb_public_ip_list *all_ips)
1996 struct ctdb_public_ip_list *tmp_ip;
1997 int i, numnodes;
1999 numnodes = talloc_array_length(ipflags);
2001 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2002 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2003 * always be allocated the same way for a specific set of
2004 * available/unavailable nodes.
2007 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2008 tmp_ip->pnn = i % numnodes;
2011 /* IP failback doesn't make sense with deterministic
2012 * IPs, since the modulo step above implicitly fails
2013 * back IPs to their "home" node.
2015 if (1 == ctdb->tunable.no_ip_failback) {
2016 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2019 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2021 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2023 /* No failback here! */
2026 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2027 struct ctdb_ipflags *ipflags,
2028 struct ctdb_public_ip_list *all_ips)
2030 /* This should be pushed down into basic_failback. */
2031 struct ctdb_public_ip_list *tmp_ip;
2032 int num_ips = 0;
2033 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2034 num_ips++;
2037 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2039 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2041 /* If we don't want IPs to fail back then don't rebalance IPs. */
2042 if (1 == ctdb->tunable.no_ip_failback) {
2043 return;
2046 /* Now, try to make sure the ip adresses are evenly distributed
2047 across the nodes.
2049 basic_failback(ctdb, ipflags, all_ips, num_ips);
2052 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2053 struct ctdb_ipflags *ipflags,
2054 struct ctdb_public_ip_list *all_ips,
2055 uint32_t *force_rebalance_nodes)
2057 uint32_t *lcp2_imbalances;
2058 bool *rebalance_candidates;
2059 int numnodes, num_rebalance_candidates, i;
2061 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2063 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2065 lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2066 &lcp2_imbalances, &rebalance_candidates);
2068 lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2070 /* If we don't want IPs to fail back then don't rebalance IPs. */
2071 if (1 == ctdb->tunable.no_ip_failback) {
2072 goto finished;
2075 /* It is only worth continuing if we have suitable target
2076 * nodes to transfer IPs to. This check is much cheaper than
2077 * continuing on...
2079 numnodes = talloc_array_length(ipflags);
2080 num_rebalance_candidates = 0;
2081 for (i=0; i<numnodes; i++) {
2082 if (rebalance_candidates[i]) {
2083 num_rebalance_candidates++;
2086 if (num_rebalance_candidates == 0) {
2087 goto finished;
2090 /* Now, try to make sure the ip adresses are evenly distributed
2091 across the nodes.
2093 lcp2_failback(ctdb, ipflags, all_ips,
2094 lcp2_imbalances, rebalance_candidates);
2096 finished:
2097 talloc_free(tmp_ctx);
2100 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2102 int i;
2104 for (i=0;i<nodemap->num;i++) {
2105 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2106 /* Found one completely healthy node */
2107 return false;
2111 return true;
2114 /* The calculation part of the IP allocation algorithm. */
2115 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2116 struct ctdb_ipflags *ipflags,
2117 struct ctdb_public_ip_list **all_ips_p,
2118 uint32_t *force_rebalance_nodes)
2120 /* since nodes only know about those public addresses that
2121 can be served by that particular node, no single node has
2122 a full list of all public addresses that exist in the cluster.
2123 Walk over all node structures and create a merged list of
2124 all public addresses that exist in the cluster.
2126 keep the tree of ips around as ctdb->ip_tree
2128 *all_ips_p = create_merged_ip_list(ctdb);
2130 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2131 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2132 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2133 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2134 } else {
2135 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2138 /* at this point ->pnn is the node which will own each IP
2139 or -1 if there is no node that can cover this ip
2142 return;
/* State shared between get_tunable_from_nodes() and its callbacks. */
struct get_tunable_callback_data {
	/* Name of the tunable being fetched, used in log messages */
	const char *tunable;
	/* talloc array of results, indexed by node number */
	uint32_t *out;
	/* Set by the callbacks when an error must abort the fetch */
	bool fatal;
};
2151 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2152 int32_t res, TDB_DATA outdata,
2153 void *callback)
2155 struct get_tunable_callback_data *cd =
2156 (struct get_tunable_callback_data *)callback;
2157 int size;
2159 if (res != 0) {
2160 /* Already handled in fail callback */
2161 return;
2164 if (outdata.dsize != sizeof(uint32_t)) {
2165 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2166 cd->tunable, pnn, (int)sizeof(uint32_t),
2167 (int)outdata.dsize));
2168 cd->fatal = true;
2169 return;
2172 size = talloc_array_length(cd->out);
2173 if (pnn >= size) {
2174 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2175 cd->tunable, pnn, size));
2176 return;
2180 cd->out[pnn] = *(uint32_t *)outdata.dptr;
2183 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2184 int32_t res, TDB_DATA outdata,
2185 void *callback)
2187 struct get_tunable_callback_data *cd =
2188 (struct get_tunable_callback_data *)callback;
2190 switch (res) {
2191 case -ETIME:
2192 DEBUG(DEBUG_ERR,
2193 ("Timed out getting tunable \"%s\" from node %d\n",
2194 cd->tunable, pnn));
2195 cd->fatal = true;
2196 break;
2197 case -EINVAL:
2198 case -1:
2199 DEBUG(DEBUG_WARNING,
2200 ("Tunable \"%s\" not implemented on node %d\n",
2201 cd->tunable, pnn));
2202 break;
2203 default:
2204 DEBUG(DEBUG_ERR,
2205 ("Unexpected error getting tunable \"%s\" from node %d\n",
2206 cd->tunable, pnn));
2207 cd->fatal = true;
2211 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2212 TALLOC_CTX *tmp_ctx,
2213 struct ctdb_node_map *nodemap,
2214 const char *tunable,
2215 uint32_t default_value)
2217 TDB_DATA data;
2218 struct ctdb_control_get_tunable *t;
2219 uint32_t *nodes;
2220 uint32_t *tvals;
2221 struct get_tunable_callback_data callback_data;
2222 int i;
2224 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2225 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2226 for (i=0; i<nodemap->num; i++) {
2227 tvals[i] = default_value;
2230 callback_data.out = tvals;
2231 callback_data.tunable = tunable;
2232 callback_data.fatal = false;
2234 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2235 data.dptr = talloc_size(tmp_ctx, data.dsize);
2236 t = (struct ctdb_control_get_tunable *)data.dptr;
2237 t->length = strlen(tunable)+1;
2238 memcpy(t->name, tunable, t->length);
2239 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2240 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2241 nodes, 0, TAKEOVER_TIMEOUT(),
2242 false, data,
2243 get_tunable_callback,
2244 get_tunable_fail_callback,
2245 &callback_data) != 0) {
2246 if (callback_data.fatal) {
2247 talloc_free(tvals);
2248 tvals = NULL;
2251 talloc_free(nodes);
2252 talloc_free(data.dptr);
2254 return tvals;
/* State shared between get_runstate_from_nodes() and its callbacks. */
struct get_runstate_callback_data {
	/* talloc array of results, indexed by node number */
	enum ctdb_runstate *out;
	/* Set by the callbacks when an error must abort the fetch */
	bool fatal;
};
2262 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2263 int32_t res, TDB_DATA outdata,
2264 void *callback_data)
2266 struct get_runstate_callback_data *cd =
2267 (struct get_runstate_callback_data *)callback_data;
2268 int size;
2270 if (res != 0) {
2271 /* Already handled in fail callback */
2272 return;
2275 if (outdata.dsize != sizeof(uint32_t)) {
2276 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2277 pnn, (int)sizeof(uint32_t),
2278 (int)outdata.dsize));
2279 cd->fatal = true;
2280 return;
2283 size = talloc_array_length(cd->out);
2284 if (pnn >= size) {
2285 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2286 pnn, size));
2287 return;
2290 cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2293 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2294 int32_t res, TDB_DATA outdata,
2295 void *callback)
2297 struct get_runstate_callback_data *cd =
2298 (struct get_runstate_callback_data *)callback;
2300 switch (res) {
2301 case -ETIME:
2302 DEBUG(DEBUG_ERR,
2303 ("Timed out getting runstate from node %d\n", pnn));
2304 cd->fatal = true;
2305 break;
2306 default:
2307 DEBUG(DEBUG_WARNING,
2308 ("Error getting runstate from node %d - assuming runstates not supported\n",
2309 pnn));
2313 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2314 TALLOC_CTX *tmp_ctx,
2315 struct ctdb_node_map *nodemap,
2316 enum ctdb_runstate default_value)
2318 uint32_t *nodes;
2319 enum ctdb_runstate *rs;
2320 struct get_runstate_callback_data callback_data;
2321 int i;
2323 rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2324 CTDB_NO_MEMORY_NULL(ctdb, rs);
2325 for (i=0; i<nodemap->num; i++) {
2326 rs[i] = default_value;
2329 callback_data.out = rs;
2330 callback_data.fatal = false;
2332 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2333 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2334 nodes, 0, TAKEOVER_TIMEOUT(),
2335 true, tdb_null,
2336 get_runstate_callback,
2337 get_runstate_fail_callback,
2338 &callback_data) != 0) {
2339 if (callback_data.fatal) {
2340 free(rs);
2341 rs = NULL;
2344 talloc_free(nodes);
2346 return rs;
2349 /* Set internal flags for IP allocation:
2350 * Clear ip flags
2351 * Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2352 * Set NOIPHOST ip flag for each INACTIVE node
2353 * if all nodes are disabled:
2354 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2355 * else
2356 * Set NOIPHOST ip flags for disabled nodes
2358 static struct ctdb_ipflags *
2359 set_ipflags_internal(struct ctdb_context *ctdb,
2360 TALLOC_CTX *tmp_ctx,
2361 struct ctdb_node_map *nodemap,
2362 uint32_t *tval_noiptakeover,
2363 uint32_t *tval_noiphostonalldisabled,
2364 enum ctdb_runstate *runstate)
2366 int i;
2367 struct ctdb_ipflags *ipflags;
2369 /* Clear IP flags - implicit due to talloc_zero */
2370 ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2371 CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2373 for (i=0;i<nodemap->num;i++) {
2374 /* Can not take IPs on node with NoIPTakeover set */
2375 if (tval_noiptakeover[i] != 0) {
2376 ipflags[i].noiptakeover = true;
2379 /* Can not host IPs on node not in RUNNING state */
2380 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2381 ipflags[i].noiphost = true;
2382 continue;
2384 /* Can not host IPs on INACTIVE node */
2385 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2386 ipflags[i].noiphost = true;
2388 /* Remember the runstate */
2389 ipflags[i].runstate = runstate[i];
2392 if (all_nodes_are_disabled(nodemap)) {
2393 /* If all nodes are disabled, can not host IPs on node
2394 * with NoIPHostOnAllDisabled set
2396 for (i=0;i<nodemap->num;i++) {
2397 if (tval_noiphostonalldisabled[i] != 0) {
2398 ipflags[i].noiphost = true;
2401 } else {
2402 /* If some nodes are not disabled, then can not host
2403 * IPs on DISABLED node
2405 for (i=0;i<nodemap->num;i++) {
2406 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2407 ipflags[i].noiphost = true;
2412 return ipflags;
2415 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2416 TALLOC_CTX *tmp_ctx,
2417 struct ctdb_node_map *nodemap)
2419 uint32_t *tval_noiptakeover;
2420 uint32_t *tval_noiphostonalldisabled;
2421 struct ctdb_ipflags *ipflags;
2422 enum ctdb_runstate *runstate;
2425 tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2426 "NoIPTakeover", 0);
2427 if (tval_noiptakeover == NULL) {
2428 return NULL;
2431 tval_noiphostonalldisabled =
2432 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2433 "NoIPHostOnAllDisabled", 0);
2434 if (tval_noiphostonalldisabled == NULL) {
2435 /* Caller frees tmp_ctx */
2436 return NULL;
2439 /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2440 * will default to CTDB_RUNSTATE_RUNNING. This ensures
2441 * reasonable behaviour on a mixed cluster during upgrade.
2443 runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2444 CTDB_RUNSTATE_RUNNING);
2445 if (runstate == NULL) {
2446 /* Caller frees tmp_ctx */
2447 return NULL;
2450 ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2451 tval_noiptakeover,
2452 tval_noiphostonalldisabled,
2453 runstate);
2455 talloc_free(tval_noiptakeover);
2456 talloc_free(tval_noiphostonalldisabled);
2457 talloc_free(runstate);
2459 return ipflags;
2462 struct iprealloc_callback_data {
2463 bool *retry_nodes;
2464 int retry_count;
2465 client_async_callback fail_callback;
2466 void *fail_callback_data;
2467 struct ctdb_node_map *nodemap;
2470 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2471 int32_t res, TDB_DATA outdata,
2472 void *callback)
2474 int numnodes;
2475 struct iprealloc_callback_data *cd =
2476 (struct iprealloc_callback_data *)callback;
2478 numnodes = talloc_array_length(cd->retry_nodes);
2479 if (pnn > numnodes) {
2480 DEBUG(DEBUG_ERR,
2481 ("ipreallocated failure from node %d, "
2482 "but only %d nodes in nodemap\n",
2483 pnn, numnodes));
2484 return;
2487 /* Can't run the "ipreallocated" event on a INACTIVE node */
2488 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2489 DEBUG(DEBUG_WARNING,
2490 ("ipreallocated failed on inactive node %d, ignoring\n",
2491 pnn));
2492 return;
2495 switch (res) {
2496 case -ETIME:
2497 /* If the control timed out then that's a real error,
2498 * so call the real fail callback
2500 if (cd->fail_callback) {
2501 cd->fail_callback(ctdb, pnn, res, outdata,
2502 cd->fail_callback_data);
2503 } else {
2504 DEBUG(DEBUG_WARNING,
2505 ("iprealloc timed out but no callback registered\n"));
2507 break;
2508 default:
2509 /* If not a timeout then either the ipreallocated
2510 * eventscript (or some setup) failed. This might
2511 * have failed because the IPREALLOCATED control isn't
2512 * implemented - right now there is no way of knowing
2513 * because the error codes are all folded down to -1.
2514 * Consider retrying using EVENTSCRIPT control...
2516 DEBUG(DEBUG_WARNING,
2517 ("ipreallocated failure from node %d, flagging retry\n",
2518 pnn));
2519 cd->retry_nodes[pnn] = true;
2520 cd->retry_count++;
2524 struct takeover_callback_data {
2525 bool *node_failed;
2526 client_async_callback fail_callback;
2527 void *fail_callback_data;
2528 struct ctdb_node_map *nodemap;
2531 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2532 uint32_t node_pnn, int32_t res,
2533 TDB_DATA outdata, void *callback_data)
2535 struct takeover_callback_data *cd =
2536 talloc_get_type_abort(callback_data,
2537 struct takeover_callback_data);
2538 int i;
2540 for (i = 0; i < cd->nodemap->num; i++) {
2541 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2542 break;
2546 if (i == cd->nodemap->num) {
2547 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2548 return;
2551 if (!cd->node_failed[i]) {
2552 cd->node_failed[i] = true;
2553 cd->fail_callback(ctdb, node_pnn, res, outdata,
2554 cd->fail_callback_data);
2559 make any IP alias changes for public addresses that are necessary
2561 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2562 uint32_t *force_rebalance_nodes,
2563 client_async_callback fail_callback, void *callback_data)
2565 int i, j, ret;
2566 struct ctdb_public_ip ip;
2567 uint32_t *nodes;
2568 struct ctdb_public_ip_list *all_ips, *tmp_ip;
2569 TDB_DATA data;
2570 struct timeval timeout;
2571 struct client_async_data *async_data;
2572 struct ctdb_client_control_state *state;
2573 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2574 struct ctdb_ipflags *ipflags;
2575 struct takeover_callback_data *takeover_data;
2576 struct iprealloc_callback_data iprealloc_data;
2577 bool *retry_data;
2578 bool can_host_ips;
2581 * ip failover is completely disabled, just send out the
2582 * ipreallocated event.
2584 if (ctdb->tunable.disable_ip_failover != 0) {
2585 goto ipreallocated;
2588 ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2589 if (ipflags == NULL) {
2590 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2591 talloc_free(tmp_ctx);
2592 return -1;
2595 /* Short-circuit IP allocation if no nodes are in the RUNNING
2596 * runstate yet, since no nodes will be able to host IPs */
2597 can_host_ips = false;
2598 for (i=0; i<nodemap->num; i++) {
2599 if (ipflags[i].runstate == CTDB_RUNSTATE_RUNNING) {
2600 can_host_ips = true;
2603 if (!can_host_ips) {
2604 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2605 return 0;
2608 /* Do the IP reassignment calculations */
2609 ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2611 /* Now tell all nodes to release any public IPs should not
2612 * host. This will be a NOOP on nodes that don't currently
2613 * hold the given IP.
2615 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2616 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2618 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2619 bool, nodemap->num);
2620 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2621 takeover_data->fail_callback = fail_callback;
2622 takeover_data->fail_callback_data = callback_data;
2623 takeover_data->nodemap = nodemap;
2625 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2626 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2628 async_data->fail_callback = takeover_run_fail_callback;
2629 async_data->callback_data = takeover_data;
2631 ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2633 /* Send a RELEASE_IP to all nodes that should not be hosting
2634 * each IP. For each IP, all but one of these will be
2635 * redundant. However, the redundant ones are used to tell
2636 * nodes which node should be hosting the IP so that commands
2637 * like "ctdb ip" can display a particular nodes idea of who
2638 * is hosting what. */
2639 for (i=0;i<nodemap->num;i++) {
2640 /* don't talk to unconnected nodes, but do talk to banned nodes */
2641 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2642 continue;
2645 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2646 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2647 /* This node should be serving this
2648 vnn so dont tell it to release the ip
2650 continue;
2652 ip.pnn = tmp_ip->pnn;
2653 ip.addr = tmp_ip->addr;
2655 timeout = TAKEOVER_TIMEOUT();
2656 data.dsize = sizeof(ip);
2657 data.dptr = (uint8_t *)&ip;
2658 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2659 0, CTDB_CONTROL_RELEASE_IP, 0,
2660 data, async_data,
2661 &timeout, NULL);
2662 if (state == NULL) {
2663 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2664 talloc_free(tmp_ctx);
2665 return -1;
2668 ctdb_client_async_add(async_data, state);
2671 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2672 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2673 talloc_free(tmp_ctx);
2674 return -1;
2676 talloc_free(async_data);
2679 /* For each IP, send a TAKOVER_IP to the node that should be
2680 * hosting it. Many of these will often be redundant (since
2681 * the allocation won't have changed) but they can be useful
2682 * to recover from inconsistencies. */
2683 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2684 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2686 async_data->fail_callback = fail_callback;
2687 async_data->callback_data = callback_data;
2689 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2690 if (tmp_ip->pnn == -1) {
2691 /* this IP won't be taken over */
2692 continue;
2695 ip.pnn = tmp_ip->pnn;
2696 ip.addr = tmp_ip->addr;
2698 timeout = TAKEOVER_TIMEOUT();
2699 data.dsize = sizeof(ip);
2700 data.dptr = (uint8_t *)&ip;
2701 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2702 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2703 data, async_data, &timeout, NULL);
2704 if (state == NULL) {
2705 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2706 talloc_free(tmp_ctx);
2707 return -1;
2710 ctdb_client_async_add(async_data, state);
2712 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2713 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2714 talloc_free(tmp_ctx);
2715 return -1;
2718 ipreallocated:
2720 * Tell all nodes to run eventscripts to process the
2721 * "ipreallocated" event. This can do a lot of things,
2722 * including restarting services to reconfigure them if public
2723 * IPs have moved. Once upon a time this event only used to
2724 * update natgw.
2726 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2727 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2728 iprealloc_data.retry_nodes = retry_data;
2729 iprealloc_data.retry_count = 0;
2730 iprealloc_data.fail_callback = fail_callback;
2731 iprealloc_data.fail_callback_data = callback_data;
2732 iprealloc_data.nodemap = nodemap;
2734 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2735 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2736 nodes, 0, TAKEOVER_TIMEOUT(),
2737 false, tdb_null,
2738 NULL, iprealloc_fail_callback,
2739 &iprealloc_data);
2740 if (ret != 0) {
2741 /* If the control failed then we should retry to any
2742 * nodes flagged by iprealloc_fail_callback using the
2743 * EVENTSCRIPT control. This is a best-effort at
2744 * backward compatiblity when running a mixed cluster
2745 * where some nodes have not yet been upgraded to
2746 * support the IPREALLOCATED control.
2748 DEBUG(DEBUG_WARNING,
2749 ("Retry ipreallocated to some nodes using eventscript control\n"));
2751 nodes = talloc_array(tmp_ctx, uint32_t,
2752 iprealloc_data.retry_count);
2753 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2755 j = 0;
2756 for (i=0; i<nodemap->num; i++) {
2757 if (iprealloc_data.retry_nodes[i]) {
2758 nodes[j] = i;
2759 j++;
2763 data.dptr = discard_const("ipreallocated");
2764 data.dsize = strlen((char *)data.dptr) + 1;
2765 ret = ctdb_client_async_control(ctdb,
2766 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2767 nodes, 0, TAKEOVER_TIMEOUT(),
2768 false, data,
2769 NULL, fail_callback,
2770 callback_data);
2771 if (ret != 0) {
2772 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2776 talloc_free(tmp_ctx);
2777 return ret;
2782 destroy a ctdb_client_ip structure
2784 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2786 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2787 ctdb_addr_to_str(&ip->addr),
2788 ntohs(ip->addr.ip.sin_port),
2789 ip->client_id));
2791 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2792 return 0;
2796 called by a client to inform us of a TCP connection that it is managing
2797 that should tickled with an ACK when IP takeover is done
2799 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2800 TDB_DATA indata)
2802 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2803 struct ctdb_control_tcp_addr *tcp_sock = NULL;
2804 struct ctdb_tcp_list *tcp;
2805 struct ctdb_tcp_connection t;
2806 int ret;
2807 TDB_DATA data;
2808 struct ctdb_client_ip *ip;
2809 struct ctdb_vnn *vnn;
2810 ctdb_sock_addr addr;
2812 /* If we don't have public IPs, tickles are useless */
2813 if (ctdb->vnn == NULL) {
2814 return 0;
2817 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2819 addr = tcp_sock->src;
2820 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2821 addr = tcp_sock->dest;
2822 ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2824 ZERO_STRUCT(addr);
2825 memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2826 vnn = find_public_ip_vnn(ctdb, &addr);
2827 if (vnn == NULL) {
2828 switch (addr.sa.sa_family) {
2829 case AF_INET:
2830 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2831 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2832 ctdb_addr_to_str(&addr)));
2834 break;
2835 case AF_INET6:
2836 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2837 ctdb_addr_to_str(&addr)));
2838 break;
2839 default:
2840 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2843 return 0;
2846 if (vnn->pnn != ctdb->pnn) {
2847 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2848 ctdb_addr_to_str(&addr),
2849 client_id, client->pid));
2850 /* failing this call will tell smbd to die */
2851 return -1;
2854 ip = talloc(client, struct ctdb_client_ip);
2855 CTDB_NO_MEMORY(ctdb, ip);
2857 ip->ctdb = ctdb;
2858 ip->addr = addr;
2859 ip->client_id = client_id;
2860 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2861 DLIST_ADD(ctdb->client_ip_list, ip);
2863 tcp = talloc(client, struct ctdb_tcp_list);
2864 CTDB_NO_MEMORY(ctdb, tcp);
2866 tcp->connection.src_addr = tcp_sock->src;
2867 tcp->connection.dst_addr = tcp_sock->dest;
2869 DLIST_ADD(client->tcp_list, tcp);
2871 t.src_addr = tcp_sock->src;
2872 t.dst_addr = tcp_sock->dest;
2874 data.dptr = (uint8_t *)&t;
2875 data.dsize = sizeof(t);
2877 switch (addr.sa.sa_family) {
2878 case AF_INET:
2879 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2880 (unsigned)ntohs(tcp_sock->dest.ip.sin_port),
2881 ctdb_addr_to_str(&tcp_sock->src),
2882 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2883 break;
2884 case AF_INET6:
2885 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2886 (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port),
2887 ctdb_addr_to_str(&tcp_sock->src),
2888 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2889 break;
2890 default:
2891 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2895 /* tell all nodes about this tcp connection */
2896 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2897 CTDB_CONTROL_TCP_ADD,
2898 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2899 if (ret != 0) {
2900 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2901 return -1;
2904 return 0;
2908 find a tcp address on a list
2910 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2911 struct ctdb_tcp_connection *tcp)
2913 int i;
2915 if (array == NULL) {
2916 return NULL;
2919 for (i=0;i<array->num;i++) {
2920 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2921 ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2922 return &array->connections[i];
2925 return NULL;
2931 called by a daemon to inform us of a TCP connection that one of its
2932 clients managing that should tickled with an ACK when IP takeover is
2933 done
2935 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2937 struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2938 struct ctdb_tcp_array *tcparray;
2939 struct ctdb_tcp_connection tcp;
2940 struct ctdb_vnn *vnn;
2942 /* If we don't have public IPs, tickles are useless */
2943 if (ctdb->vnn == NULL) {
2944 return 0;
2947 vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2948 if (vnn == NULL) {
2949 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2950 ctdb_addr_to_str(&p->dst_addr)));
2952 return -1;
2956 tcparray = vnn->tcp_array;
2958 /* If this is the first tickle */
2959 if (tcparray == NULL) {
2960 tcparray = talloc(vnn, struct ctdb_tcp_array);
2961 CTDB_NO_MEMORY(ctdb, tcparray);
2962 vnn->tcp_array = tcparray;
2964 tcparray->num = 0;
2965 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2966 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2968 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2969 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2970 tcparray->num++;
2972 if (tcp_update_needed) {
2973 vnn->tcp_update_needed = true;
2975 return 0;
2979 /* Do we already have this tickle ?*/
2980 tcp.src_addr = p->src_addr;
2981 tcp.dst_addr = p->dst_addr;
2982 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2983 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2984 ctdb_addr_to_str(&tcp.dst_addr),
2985 ntohs(tcp.dst_addr.ip.sin_port),
2986 vnn->pnn));
2987 return 0;
2990 /* A new tickle, we must add it to the array */
2991 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2992 struct ctdb_tcp_connection,
2993 tcparray->num+1);
2994 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2996 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2997 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2998 tcparray->num++;
3000 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3001 ctdb_addr_to_str(&tcp.dst_addr),
3002 ntohs(tcp.dst_addr.ip.sin_port),
3003 vnn->pnn));
3005 if (tcp_update_needed) {
3006 vnn->tcp_update_needed = true;
3009 return 0;
3014 called by a daemon to inform us of a TCP connection that one of its
3015 clients managing that should tickled with an ACK when IP takeover is
3016 done
3018 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3020 struct ctdb_tcp_connection *tcpp;
3021 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3023 if (vnn == NULL) {
3024 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3025 ctdb_addr_to_str(&conn->dst_addr)));
3026 return;
3029 /* if the array is empty we cant remove it
3030 and we dont need to do anything
3032 if (vnn->tcp_array == NULL) {
3033 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3034 ctdb_addr_to_str(&conn->dst_addr),
3035 ntohs(conn->dst_addr.ip.sin_port)));
3036 return;
3040 /* See if we know this connection
3041 if we dont know this connection then we dont need to do anything
3043 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3044 if (tcpp == NULL) {
3045 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3046 ctdb_addr_to_str(&conn->dst_addr),
3047 ntohs(conn->dst_addr.ip.sin_port)));
3048 return;
3052 /* We need to remove this entry from the array.
3053 Instead of allocating a new array and copying data to it
3054 we cheat and just copy the last entry in the existing array
3055 to the entry that is to be removed and just shring the
3056 ->num field
3058 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3059 vnn->tcp_array->num--;
3061 /* If we deleted the last entry we also need to remove the entire array
3063 if (vnn->tcp_array->num == 0) {
3064 talloc_free(vnn->tcp_array);
3065 vnn->tcp_array = NULL;
3068 vnn->tcp_update_needed = true;
3070 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3071 ctdb_addr_to_str(&conn->src_addr),
3072 ntohs(conn->src_addr.ip.sin_port)));
3077 called by a daemon to inform us of a TCP connection that one of its
3078 clients used are no longer needed in the tickle database
3080 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3082 struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3084 /* If we don't have public IPs, tickles are useless */
3085 if (ctdb->vnn == NULL) {
3086 return 0;
3089 ctdb_remove_tcp_connection(ctdb, conn);
3091 return 0;
3096 Called when another daemon starts - causes all tickles for all
3097 public addresses we are serving to be sent to the new node on the
3098 next check. This actually causes the next scheduled call to
3099 tdb_update_tcp_tickles() to update all nodes. This is simple and
3100 doesn't require careful error handling.
3102 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3104 struct ctdb_vnn *vnn;
3106 DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3107 (unsigned long) pnn));
3109 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3110 vnn->tcp_update_needed = true;
3113 return 0;
3118 called when a client structure goes away - hook to remove
3119 elements from the tcp_list in all daemons
3121 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3123 while (client->tcp_list) {
3124 struct ctdb_tcp_list *tcp = client->tcp_list;
3125 DLIST_REMOVE(client->tcp_list, tcp);
3126 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3132 release all IPs on shutdown
3134 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3136 struct ctdb_vnn *vnn;
3137 int count = 0;
3139 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3140 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3141 ctdb_vnn_unassign_iface(ctdb, vnn);
3142 continue;
3144 if (!vnn->iface) {
3145 continue;
3148 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3149 ctdb_addr_to_str(&vnn->public_address),
3150 vnn->public_netmask_bits,
3151 ctdb_vnn_iface_string(vnn)));
3153 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3154 ctdb_vnn_iface_string(vnn),
3155 ctdb_addr_to_str(&vnn->public_address),
3156 vnn->public_netmask_bits);
3157 release_kill_clients(ctdb, &vnn->public_address);
3158 ctdb_vnn_unassign_iface(ctdb, vnn);
3159 count++;
3162 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3167 get list of public IPs
3169 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3170 struct ctdb_req_control *c, TDB_DATA *outdata)
3172 int i, num, len;
3173 struct ctdb_all_public_ips *ips;
3174 struct ctdb_vnn *vnn;
3175 bool only_available = false;
3177 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3178 only_available = true;
3181 /* count how many public ip structures we have */
3182 num = 0;
3183 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3184 num++;
3187 len = offsetof(struct ctdb_all_public_ips, ips) +
3188 num*sizeof(struct ctdb_public_ip);
3189 ips = talloc_zero_size(outdata, len);
3190 CTDB_NO_MEMORY(ctdb, ips);
3192 i = 0;
3193 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3194 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3195 continue;
3197 ips->ips[i].pnn = vnn->pnn;
3198 ips->ips[i].addr = vnn->public_address;
3199 i++;
3201 ips->num = i;
3202 len = offsetof(struct ctdb_all_public_ips, ips) +
3203 i*sizeof(struct ctdb_public_ip);
3205 outdata->dsize = len;
3206 outdata->dptr = (uint8_t *)ips;
3208 return 0;
3212 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3213 struct ctdb_req_control *c,
3214 TDB_DATA indata,
3215 TDB_DATA *outdata)
3217 int i, num, len;
3218 ctdb_sock_addr *addr;
3219 struct ctdb_control_public_ip_info *info;
3220 struct ctdb_vnn *vnn;
3222 addr = (ctdb_sock_addr *)indata.dptr;
3224 vnn = find_public_ip_vnn(ctdb, addr);
3225 if (vnn == NULL) {
3226 /* if it is not a public ip it could be our 'single ip' */
3227 if (ctdb->single_ip_vnn) {
3228 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3229 vnn = ctdb->single_ip_vnn;
3233 if (vnn == NULL) {
3234 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3235 "'%s'not a public address\n",
3236 ctdb_addr_to_str(addr)));
3237 return -1;
3240 /* count how many public ip structures we have */
3241 num = 0;
3242 for (;vnn->ifaces[num];) {
3243 num++;
3246 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3247 num*sizeof(struct ctdb_control_iface_info);
3248 info = talloc_zero_size(outdata, len);
3249 CTDB_NO_MEMORY(ctdb, info);
3251 info->ip.addr = vnn->public_address;
3252 info->ip.pnn = vnn->pnn;
3253 info->active_idx = 0xFFFFFFFF;
3255 for (i=0; vnn->ifaces[i]; i++) {
3256 struct ctdb_iface *cur;
3258 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3259 if (cur == NULL) {
3260 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3261 vnn->ifaces[i]));
3262 return -1;
3264 if (vnn->iface == cur) {
3265 info->active_idx = i;
3267 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3268 info->ifaces[i].link_state = cur->link_up;
3269 info->ifaces[i].references = cur->references;
3271 info->num = i;
3272 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3273 i*sizeof(struct ctdb_control_iface_info);
3275 outdata->dsize = len;
3276 outdata->dptr = (uint8_t *)info;
3278 return 0;
3281 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3282 struct ctdb_req_control *c,
3283 TDB_DATA *outdata)
3285 int i, num, len;
3286 struct ctdb_control_get_ifaces *ifaces;
3287 struct ctdb_iface *cur;
3289 /* count how many public ip structures we have */
3290 num = 0;
3291 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3292 num++;
3295 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3296 num*sizeof(struct ctdb_control_iface_info);
3297 ifaces = talloc_zero_size(outdata, len);
3298 CTDB_NO_MEMORY(ctdb, ifaces);
3300 i = 0;
3301 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3302 strcpy(ifaces->ifaces[i].name, cur->name);
3303 ifaces->ifaces[i].link_state = cur->link_up;
3304 ifaces->ifaces[i].references = cur->references;
3305 i++;
3307 ifaces->num = i;
3308 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3309 i*sizeof(struct ctdb_control_iface_info);
3311 outdata->dsize = len;
3312 outdata->dptr = (uint8_t *)ifaces;
3314 return 0;
3317 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3318 struct ctdb_req_control *c,
3319 TDB_DATA indata)
3321 struct ctdb_control_iface_info *info;
3322 struct ctdb_iface *iface;
3323 bool link_up = false;
3325 info = (struct ctdb_control_iface_info *)indata.dptr;
3327 if (info->name[CTDB_IFACE_SIZE] != '\0') {
3328 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3329 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3330 len, len, info->name));
3331 return -1;
3334 switch (info->link_state) {
3335 case 0:
3336 link_up = false;
3337 break;
3338 case 1:
3339 link_up = true;
3340 break;
3341 default:
3342 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3343 (unsigned int)info->link_state));
3344 return -1;
3347 if (info->references != 0) {
3348 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3349 (unsigned int)info->references));
3350 return -1;
3353 iface = ctdb_find_iface(ctdb, info->name);
3354 if (iface == NULL) {
3355 return -1;
3358 if (link_up == iface->link_up) {
3359 return 0;
3362 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3363 ("iface[%s] has changed it's link status %s => %s\n",
3364 iface->name,
3365 iface->link_up?"up":"down",
3366 link_up?"up":"down"));
3368 iface->link_up = link_up;
3369 return 0;
3374 structure containing the listening socket and the list of tcp connections
3375 that the ctdb daemon is to kill
3377 struct ctdb_kill_tcp {
3378 struct ctdb_vnn *vnn;
3379 struct ctdb_context *ctdb;
3380 int capture_fd;
3381 struct fd_event *fde;
3382 trbt_tree_t *connections;
3383 void *private_data;
3387 a tcp connection that is to be killed
3389 struct ctdb_killtcp_con {
3390 ctdb_sock_addr src_addr;
3391 ctdb_sock_addr dst_addr;
3392 int count;
3393 struct ctdb_kill_tcp *killtcp;
3396 /* this function is used to create a key to represent this socketpair
3397 in the killtcp tree.
3398 this key is used to insert and lookup matching socketpairs that are
3399 to be tickled and RST
3401 #define KILLTCP_KEYLEN 10
3402 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3404 static uint32_t key[KILLTCP_KEYLEN];
3406 bzero(key, sizeof(key));
3408 if (src->sa.sa_family != dst->sa.sa_family) {
3409 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3410 return key;
3413 switch (src->sa.sa_family) {
3414 case AF_INET:
3415 key[0] = dst->ip.sin_addr.s_addr;
3416 key[1] = src->ip.sin_addr.s_addr;
3417 key[2] = dst->ip.sin_port;
3418 key[3] = src->ip.sin_port;
3419 break;
3420 case AF_INET6: {
3421 uint32_t *dst6_addr32 =
3422 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3423 uint32_t *src6_addr32 =
3424 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3425 key[0] = dst6_addr32[3];
3426 key[1] = src6_addr32[3];
3427 key[2] = dst6_addr32[2];
3428 key[3] = src6_addr32[2];
3429 key[4] = dst6_addr32[1];
3430 key[5] = src6_addr32[1];
3431 key[6] = dst6_addr32[0];
3432 key[7] = src6_addr32[0];
3433 key[8] = dst->ip6.sin6_port;
3434 key[9] = src->ip6.sin6_port;
3435 break;
3437 default:
3438 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3439 return key;
3442 return key;
3446 called when we get a read event on the raw socket
3448 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde,
3449 uint16_t flags, void *private_data)
3451 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3452 struct ctdb_killtcp_con *con;
3453 ctdb_sock_addr src, dst;
3454 uint32_t ack_seq, seq;
3456 if (!(flags & EVENT_FD_READ)) {
3457 return;
3460 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3461 killtcp->private_data,
3462 &src, &dst,
3463 &ack_seq, &seq) != 0) {
3464 /* probably a non-tcp ACK packet */
3465 return;
3468 /* check if we have this guy in our list of connections
3469 to kill
3471 con = trbt_lookuparray32(killtcp->connections,
3472 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3473 if (con == NULL) {
3474 /* no this was some other packet we can just ignore */
3475 return;
3478 /* This one has been tickled !
3479 now reset him and remove him from the list.
3481 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3482 ntohs(con->dst_addr.ip.sin_port),
3483 ctdb_addr_to_str(&con->src_addr),
3484 ntohs(con->src_addr.ip.sin_port)));
3486 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3487 talloc_free(con);
3491 /* when traversing the list of all tcp connections to send tickle acks to
3492 (so that we can capture the ack coming back and kill the connection
3493 by a RST)
3494 this callback is called for each connection we are currently trying to kill
3496 static int tickle_connection_traverse(void *param, void *data)
3498 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3500 /* have tried too many times, just give up */
3501 if (con->count >= 5) {
3502 /* can't delete in traverse: reparent to delete_cons */
3503 talloc_steal(param, con);
3504 return 0;
3507 /* othervise, try tickling it again */
3508 con->count++;
3509 ctdb_sys_send_tcp(
3510 (ctdb_sock_addr *)&con->dst_addr,
3511 (ctdb_sock_addr *)&con->src_addr,
3512 0, 0, 0);
3513 return 0;
3518 called every second until all sentenced connections have been reset
3520 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te,
3521 struct timeval t, void *private_data)
3523 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3524 void *delete_cons = talloc_new(NULL);
3526 /* loop over all connections sending tickle ACKs */
3527 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3529 /* now we've finished traverse, it's safe to do deletion. */
3530 talloc_free(delete_cons);
3532 /* If there are no more connections to kill we can remove the
3533 entire killtcp structure
3535 if ( (killtcp->connections == NULL) ||
3536 (killtcp->connections->root == NULL) ) {
3537 talloc_free(killtcp);
3538 return;
3541 /* try tickling them again in a seconds time
3543 event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3544 ctdb_tickle_sentenced_connections, killtcp);
3548 destroy the killtcp structure
3550 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3552 struct ctdb_vnn *tmpvnn;
3554 /* verify that this vnn is still active */
3555 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3556 if (tmpvnn == killtcp->vnn) {
3557 break;
3561 if (tmpvnn == NULL) {
3562 return 0;
3565 if (killtcp->vnn->killtcp != killtcp) {
3566 return 0;
3569 killtcp->vnn->killtcp = NULL;
3571 return 0;
/*
  insert callback for the tree of connections to kill.

  Any connection structure already stored under the key is simply
  replaced by the new one.  The old one is not freed here: it is
  talloc_stolen by the same node in the tree and goes away when the new
  data is deleted.
 */
static void *add_killtcp_callback(void *parm, void *data)
{
	(void)data;	/* the previous entry is deliberately left alone */

	return parm;
}
3588 add a tcp socket to the list of connections we want to RST
3590 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3591 ctdb_sock_addr *s,
3592 ctdb_sock_addr *d)
3594 ctdb_sock_addr src, dst;
3595 struct ctdb_kill_tcp *killtcp;
3596 struct ctdb_killtcp_con *con;
3597 struct ctdb_vnn *vnn;
3599 ctdb_canonicalize_ip(s, &src);
3600 ctdb_canonicalize_ip(d, &dst);
3602 vnn = find_public_ip_vnn(ctdb, &dst);
3603 if (vnn == NULL) {
3604 vnn = find_public_ip_vnn(ctdb, &src);
3606 if (vnn == NULL) {
3607 /* if it is not a public ip it could be our 'single ip' */
3608 if (ctdb->single_ip_vnn) {
3609 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3610 vnn = ctdb->single_ip_vnn;
3614 if (vnn == NULL) {
3615 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3616 return -1;
3619 killtcp = vnn->killtcp;
3621 /* If this is the first connection to kill we must allocate
3622 a new structure
3624 if (killtcp == NULL) {
3625 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3626 CTDB_NO_MEMORY(ctdb, killtcp);
3628 killtcp->vnn = vnn;
3629 killtcp->ctdb = ctdb;
3630 killtcp->capture_fd = -1;
3631 killtcp->connections = trbt_create(killtcp, 0);
3633 vnn->killtcp = killtcp;
3634 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3639 /* create a structure that describes this connection we want to
3640 RST and store it in killtcp->connections
3642 con = talloc(killtcp, struct ctdb_killtcp_con);
3643 CTDB_NO_MEMORY(ctdb, con);
3644 con->src_addr = src;
3645 con->dst_addr = dst;
3646 con->count = 0;
3647 con->killtcp = killtcp;
3650 trbt_insertarray32_callback(killtcp->connections,
3651 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3652 add_killtcp_callback, con);
3655 If we dont have a socket to listen on yet we must create it
3657 if (killtcp->capture_fd == -1) {
3658 const char *iface = ctdb_vnn_iface_string(vnn);
3659 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3660 if (killtcp->capture_fd == -1) {
3661 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3662 "socket on iface '%s' for killtcp (%s)\n",
3663 iface, strerror(errno)));
3664 goto failed;
3669 if (killtcp->fde == NULL) {
3670 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd,
3671 EVENT_FD_READ,
3672 capture_tcp_handler, killtcp);
3673 tevent_fd_set_auto_close(killtcp->fde);
3675 /* We also need to set up some events to tickle all these connections
3676 until they are all reset
3678 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3679 ctdb_tickle_sentenced_connections, killtcp);
3682 /* tickle him once now */
3683 ctdb_sys_send_tcp(
3684 &con->dst_addr,
3685 &con->src_addr,
3686 0, 0, 0);
3688 return 0;
3690 failed:
3691 talloc_free(vnn->killtcp);
3692 vnn->killtcp = NULL;
3693 return -1;
3697 kill a TCP connection.
3699 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3701 struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3703 return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3707 called by a daemon to inform us of the entire list of TCP tickles for
3708 a particular public address.
3709 this control should only be sent by the node that is currently serving
3710 that public address.
3712 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3714 struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3715 struct ctdb_tcp_array *tcparray;
3716 struct ctdb_vnn *vnn;
3718 /* We must at least have tickles.num or else we cant verify the size
3719 of the received data blob
3721 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3722 tickles.connections)) {
3723 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3724 return -1;
3727 /* verify that the size of data matches what we expect */
3728 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3729 tickles.connections)
3730 + sizeof(struct ctdb_tcp_connection)
3731 * list->tickles.num) {
3732 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3733 return -1;
3736 DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3737 ctdb_addr_to_str(&list->addr)));
3739 vnn = find_public_ip_vnn(ctdb, &list->addr);
3740 if (vnn == NULL) {
3741 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3742 ctdb_addr_to_str(&list->addr)));
3744 return 1;
3747 /* remove any old ticklelist we might have */
3748 talloc_free(vnn->tcp_array);
3749 vnn->tcp_array = NULL;
3751 tcparray = talloc(vnn, struct ctdb_tcp_array);
3752 CTDB_NO_MEMORY(ctdb, tcparray);
3754 tcparray->num = list->tickles.num;
3756 tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3757 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3759 memcpy(tcparray->connections, &list->tickles.connections[0],
3760 sizeof(struct ctdb_tcp_connection)*tcparray->num);
3762 /* We now have a new fresh tickle list array for this vnn */
3763 vnn->tcp_array = tcparray;
3765 return 0;
3769 called to return the full list of tickles for the puclic address associated
3770 with the provided vnn
3772 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3774 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3775 struct ctdb_control_tcp_tickle_list *list;
3776 struct ctdb_tcp_array *tcparray;
3777 int num;
3778 struct ctdb_vnn *vnn;
3780 vnn = find_public_ip_vnn(ctdb, addr);
3781 if (vnn == NULL) {
3782 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3783 ctdb_addr_to_str(addr)));
3785 return 1;
3788 tcparray = vnn->tcp_array;
3789 if (tcparray) {
3790 num = tcparray->num;
3791 } else {
3792 num = 0;
3795 outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list,
3796 tickles.connections)
3797 + sizeof(struct ctdb_tcp_connection) * num;
3799 outdata->dptr = talloc_size(outdata, outdata->dsize);
3800 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3801 list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3803 list->addr = *addr;
3804 list->tickles.num = num;
3805 if (num) {
3806 memcpy(&list->tickles.connections[0], tcparray->connections,
3807 sizeof(struct ctdb_tcp_connection) * num);
3810 return 0;
3815 set the list of all tcp tickles for a public address
3817 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3818 ctdb_sock_addr *addr,
3819 struct ctdb_tcp_array *tcparray)
3821 int ret, num;
3822 TDB_DATA data;
3823 struct ctdb_control_tcp_tickle_list *list;
3825 if (tcparray) {
3826 num = tcparray->num;
3827 } else {
3828 num = 0;
3831 data.dsize = offsetof(struct ctdb_control_tcp_tickle_list,
3832 tickles.connections) +
3833 sizeof(struct ctdb_tcp_connection) * num;
3834 data.dptr = talloc_size(ctdb, data.dsize);
3835 CTDB_NO_MEMORY(ctdb, data.dptr);
3837 list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3838 list->addr = *addr;
3839 list->tickles.num = num;
3840 if (tcparray) {
3841 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3844 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3845 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3846 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3847 if (ret != 0) {
3848 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3849 return -1;
3852 talloc_free(data.dptr);
3854 return ret;
3859 perform tickle updates if required
3861 static void ctdb_update_tcp_tickles(struct event_context *ev,
3862 struct timed_event *te,
3863 struct timeval t, void *private_data)
3865 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3866 int ret;
3867 struct ctdb_vnn *vnn;
3869 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3870 /* we only send out updates for public addresses that
3871 we have taken over
3873 if (ctdb->pnn != vnn->pnn) {
3874 continue;
3876 /* We only send out the updates if we need to */
3877 if (!vnn->tcp_update_needed) {
3878 continue;
3880 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3881 &vnn->public_address,
3882 vnn->tcp_array);
3883 if (ret != 0) {
3884 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3885 ctdb_addr_to_str(&vnn->public_address)));
3886 } else {
3887 DEBUG(DEBUG_INFO,
3888 ("Sent tickle update for public address %s\n",
3889 ctdb_addr_to_str(&vnn->public_address)));
3890 vnn->tcp_update_needed = false;
3894 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3895 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3896 ctdb_update_tcp_tickles, ctdb);
3901 start periodic update of tcp tickles
3903 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3905 ctdb->tickle_update_context = talloc_new(ctdb);
3907 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3908 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3909 ctdb_update_tcp_tickles, ctdb);
3915 struct control_gratious_arp {
3916 struct ctdb_context *ctdb;
3917 ctdb_sock_addr addr;
3918 const char *iface;
3919 int count;
3923 send a control_gratuitous arp
3925 static void send_gratious_arp(struct event_context *ev, struct timed_event *te,
3926 struct timeval t, void *private_data)
3928 int ret;
3929 struct control_gratious_arp *arp = talloc_get_type(private_data,
3930 struct control_gratious_arp);
3932 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3933 if (ret != 0) {
3934 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3935 arp->iface, strerror(errno)));
3939 arp->count++;
3940 if (arp->count == CTDB_ARP_REPEAT) {
3941 talloc_free(arp);
3942 return;
3945 event_add_timed(arp->ctdb->ev, arp,
3946 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3947 send_gratious_arp, arp);
3952 send a gratious arp
3954 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3956 struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3957 struct control_gratious_arp *arp;
3959 /* verify the size of indata */
3960 if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3961 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
3962 (unsigned)indata.dsize,
3963 (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3964 return -1;
3966 if (indata.dsize !=
3967 ( offsetof(struct ctdb_control_gratious_arp, iface)
3968 + gratious_arp->len ) ){
3970 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3971 "but should be %u bytes\n",
3972 (unsigned)indata.dsize,
3973 (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3974 return -1;
3978 arp = talloc(ctdb, struct control_gratious_arp);
3979 CTDB_NO_MEMORY(ctdb, arp);
3981 arp->ctdb = ctdb;
3982 arp->addr = gratious_arp->addr;
3983 arp->iface = talloc_strdup(arp, gratious_arp->iface);
3984 CTDB_NO_MEMORY(ctdb, arp->iface);
3985 arp->count = 0;
3987 event_add_timed(arp->ctdb->ev, arp,
3988 timeval_zero(), send_gratious_arp, arp);
3990 return 0;
3993 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3995 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3996 int ret;
3998 /* verify the size of indata */
3999 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4000 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4001 return -1;
4003 if (indata.dsize !=
4004 ( offsetof(struct ctdb_control_ip_iface, iface)
4005 + pub->len ) ){
4007 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4008 "but should be %u bytes\n",
4009 (unsigned)indata.dsize,
4010 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4011 return -1;
4014 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4016 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4018 if (ret != 0) {
4019 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4020 return -1;
4023 return 0;
/* carried from ctdb_control_del_public_address() to delete_ip_callback() */
struct delete_ip_callback_state {
	struct ctdb_req_control *c;	/* the control that gets the reply */
};
4031 called when releaseip event finishes for del_public_address
4033 static void delete_ip_callback(struct ctdb_context *ctdb,
4034 int32_t status, TDB_DATA data,
4035 const char *errormsg,
4036 void *private_data)
4038 struct delete_ip_callback_state *state =
4039 talloc_get_type(private_data, struct delete_ip_callback_state);
4041 /* If release failed then fail. */
4042 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4043 talloc_free(private_data);
4046 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4047 struct ctdb_req_control *c,
4048 TDB_DATA indata, bool *async_reply)
4050 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4051 struct ctdb_vnn *vnn;
4053 /* verify the size of indata */
4054 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4055 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4056 return -1;
4058 if (indata.dsize !=
4059 ( offsetof(struct ctdb_control_ip_iface, iface)
4060 + pub->len ) ){
4062 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4063 "but should be %u bytes\n",
4064 (unsigned)indata.dsize,
4065 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4066 return -1;
4069 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4071 /* walk over all public addresses until we find a match */
4072 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4073 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4074 if (vnn->pnn == ctdb->pnn) {
4075 struct delete_ip_callback_state *state;
4076 struct ctdb_public_ip *ip;
4077 TDB_DATA data;
4078 int ret;
4080 vnn->delete_pending = true;
4082 state = talloc(ctdb,
4083 struct delete_ip_callback_state);
4084 CTDB_NO_MEMORY(ctdb, state);
4085 state->c = c;
4087 ip = talloc(state, struct ctdb_public_ip);
4088 if (ip == NULL) {
4089 DEBUG(DEBUG_ERR,
4090 (__location__ " Out of memory\n"));
4091 talloc_free(state);
4092 return -1;
4094 ip->pnn = -1;
4095 ip->addr = pub->addr;
4097 data.dsize = sizeof(struct ctdb_public_ip);
4098 data.dptr = (unsigned char *)ip;
4100 ret = ctdb_daemon_send_control(ctdb,
4101 ctdb_get_pnn(ctdb),
4103 CTDB_CONTROL_RELEASE_IP,
4104 0, 0,
4105 data,
4106 delete_ip_callback,
4107 state);
4108 if (ret == -1) {
4109 DEBUG(DEBUG_ERR,
4110 (__location__ "Unable to send "
4111 "CTDB_CONTROL_RELEASE_IP\n"));
4112 talloc_free(state);
4113 return -1;
4116 state->c = talloc_steal(state, c);
4117 *async_reply = true;
4118 } else {
4119 /* This IP is not hosted on the
4120 * current node so just delete it
4121 * now. */
4122 do_delete_ip(ctdb, vnn);
4125 return 0;
4129 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4130 ctdb_addr_to_str(&pub->addr)));
4131 return -1;
/* carried from ctdb_control_ipreallocated() to its event script callback */
struct ipreallocated_callback_state {
	struct ctdb_req_control *c;	/* the control that gets the reply */
};
4139 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4140 int status, void *p)
4142 struct ipreallocated_callback_state *state =
4143 talloc_get_type(p, struct ipreallocated_callback_state);
4145 if (status != 0) {
4146 DEBUG(DEBUG_ERR,
4147 (" \"ipreallocated\" event script failed (status %d)\n",
4148 status));
4149 if (status == -ETIME) {
4150 ctdb_ban_self(ctdb);
4154 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4155 talloc_free(state);
4158 /* A control to run the ipreallocated event */
4159 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4160 struct ctdb_req_control *c,
4161 bool *async_reply)
4163 int ret;
4164 struct ipreallocated_callback_state *state;
4166 state = talloc(ctdb, struct ipreallocated_callback_state);
4167 CTDB_NO_MEMORY(ctdb, state);
4169 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4171 ret = ctdb_event_script_callback(ctdb, state,
4172 ctdb_ipreallocated_callback, state,
4173 CTDB_EVENT_IPREALLOCATED,
4174 "%s", "");
4176 if (ret != 0) {
4177 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4178 talloc_free(state);
4179 return -1;
4182 /* tell the control that we will be reply asynchronously */
4183 state->c = talloc_steal(state, c);
4184 *async_reply = true;
4186 return 0;
4190 /* This function is called from the recovery daemon to verify that a remote
4191 node has the expected ip allocation.
4192 This is verified against ctdb->ip_tree
4194 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4195 struct ctdb_all_public_ips *ips,
4196 uint32_t pnn)
4198 struct ctdb_public_ip_list *tmp_ip;
4199 int i;
4201 if (ctdb->ip_tree == NULL) {
4202 /* dont know the expected allocation yet, assume remote node
4203 is correct. */
4204 return 0;
4207 if (ips == NULL) {
4208 return 0;
4211 for (i=0; i<ips->num; i++) {
4212 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4213 if (tmp_ip == NULL) {
4214 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4215 return -1;
4218 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4219 continue;
4222 if (tmp_ip->pnn != ips->ips[i].pnn) {
4223 DEBUG(DEBUG_ERR,
4224 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4225 pnn,
4226 ctdb_addr_to_str(&ips->ips[i].addr),
4227 ips->ips[i].pnn, tmp_ip->pnn));
4228 return -1;
4232 return 0;
4235 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4237 struct ctdb_public_ip_list *tmp_ip;
4239 if (ctdb->ip_tree == NULL) {
4240 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4241 return -1;
4244 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4245 if (tmp_ip == NULL) {
4246 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4247 return -1;
4250 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4251 tmp_ip->pnn = ip->pnn;
4253 return 0;
/* state of one reloadips run, see ctdb_control_reload_public_ips() */
struct ctdb_reloadips_handle {
	struct ctdb_context *ctdb;
	struct ctdb_req_control *c;	/* control replied to by the destructor */
	int status;			/* status sent in that reply */
	int fd[2];			/* pipe: child writes fd[1], parent reads fd[0] */
	pid_t child;			/* the forked reloadips child */
	struct fd_event *fde;		/* fd event on fd[0] */
};
4266 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4268 if (h == h->ctdb->reload_ips) {
4269 h->ctdb->reload_ips = NULL;
4271 if (h->c != NULL) {
4272 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4273 h->c = NULL;
4275 ctdb_kill(h->ctdb, h->child, SIGKILL);
4276 return 0;
4279 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4280 struct timed_event *te,
4281 struct timeval t, void *private_data)
4283 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4285 talloc_free(h);
4288 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde,
4289 uint16_t flags, void *private_data)
4291 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4293 char res;
4294 int ret;
4296 ret = sys_read(h->fd[0], &res, 1);
4297 if (ret < 1 || res != 0) {
4298 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4299 res = 1;
4301 h->status = res;
4303 talloc_free(h);
4306 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4308 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4309 struct ctdb_all_public_ips *ips;
4310 struct ctdb_vnn *vnn;
4311 struct client_async_data *async_data;
4312 struct timeval timeout;
4313 TDB_DATA data;
4314 struct ctdb_client_control_state *state;
4315 bool first_add;
4316 int i, ret;
4318 CTDB_NO_MEMORY(ctdb, mem_ctx);
4320 /* Read IPs from local node */
4321 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4322 CTDB_CURRENT_NODE, mem_ctx, &ips);
4323 if (ret != 0) {
4324 DEBUG(DEBUG_ERR,
4325 ("Unable to fetch public IPs from local node\n"));
4326 talloc_free(mem_ctx);
4327 return -1;
4330 /* Read IPs file - this is safe since this is a child process */
4331 ctdb->vnn = NULL;
4332 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4333 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4334 talloc_free(mem_ctx);
4335 return -1;
4338 async_data = talloc_zero(mem_ctx, struct client_async_data);
4339 CTDB_NO_MEMORY(ctdb, async_data);
4341 /* Compare IPs between node and file for IPs to be deleted */
4342 for (i = 0; i < ips->num; i++) {
4343 /* */
4344 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4345 if (ctdb_same_ip(&vnn->public_address,
4346 &ips->ips[i].addr)) {
4347 /* IP is still in file */
4348 break;
4352 if (vnn == NULL) {
4353 /* Delete IP ips->ips[i] */
4354 struct ctdb_control_ip_iface *pub;
4356 DEBUG(DEBUG_NOTICE,
4357 ("IP %s no longer configured, deleting it\n",
4358 ctdb_addr_to_str(&ips->ips[i].addr)));
4360 pub = talloc_zero(mem_ctx,
4361 struct ctdb_control_ip_iface);
4362 CTDB_NO_MEMORY(ctdb, pub);
4364 pub->addr = ips->ips[i].addr;
4365 pub->mask = 0;
4366 pub->len = 0;
4368 timeout = TAKEOVER_TIMEOUT();
4370 data.dsize = offsetof(struct ctdb_control_ip_iface,
4371 iface) + pub->len;
4372 data.dptr = (uint8_t *)pub;
4374 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4375 CTDB_CONTROL_DEL_PUBLIC_IP,
4376 0, data, async_data,
4377 &timeout, NULL);
4378 if (state == NULL) {
4379 DEBUG(DEBUG_ERR,
4380 (__location__
4381 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4382 goto failed;
4385 ctdb_client_async_add(async_data, state);
4389 /* Compare IPs between node and file for IPs to be added */
4390 first_add = true;
4391 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4392 for (i = 0; i < ips->num; i++) {
4393 if (ctdb_same_ip(&vnn->public_address,
4394 &ips->ips[i].addr)) {
4395 /* IP already on node */
4396 break;
4399 if (i == ips->num) {
4400 /* Add IP ips->ips[i] */
4401 struct ctdb_control_ip_iface *pub;
4402 const char *ifaces = NULL;
4403 uint32_t len;
4404 int iface = 0;
4406 DEBUG(DEBUG_NOTICE,
4407 ("New IP %s configured, adding it\n",
4408 ctdb_addr_to_str(&vnn->public_address)));
4409 if (first_add) {
4410 uint32_t pnn = ctdb_get_pnn(ctdb);
4412 data.dsize = sizeof(pnn);
4413 data.dptr = (uint8_t *)&pnn;
4415 ret = ctdb_client_send_message(
4416 ctdb,
4417 CTDB_BROADCAST_CONNECTED,
4418 CTDB_SRVID_REBALANCE_NODE,
4419 data);
4420 if (ret != 0) {
4421 DEBUG(DEBUG_WARNING,
4422 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4425 first_add = false;
4428 ifaces = vnn->ifaces[0];
4429 iface = 1;
4430 while (vnn->ifaces[iface] != NULL) {
4431 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4432 vnn->ifaces[iface]);
4433 iface++;
4436 len = strlen(ifaces) + 1;
4437 pub = talloc_zero_size(mem_ctx,
4438 offsetof(struct ctdb_control_ip_iface, iface) + len);
4439 CTDB_NO_MEMORY(ctdb, pub);
4441 pub->addr = vnn->public_address;
4442 pub->mask = vnn->public_netmask_bits;
4443 pub->len = len;
4444 memcpy(&pub->iface[0], ifaces, pub->len);
4446 timeout = TAKEOVER_TIMEOUT();
4448 data.dsize = offsetof(struct ctdb_control_ip_iface,
4449 iface) + pub->len;
4450 data.dptr = (uint8_t *)pub;
4452 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4453 CTDB_CONTROL_ADD_PUBLIC_IP,
4454 0, data, async_data,
4455 &timeout, NULL);
4456 if (state == NULL) {
4457 DEBUG(DEBUG_ERR,
4458 (__location__
4459 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4460 goto failed;
4463 ctdb_client_async_add(async_data, state);
4467 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4468 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4469 goto failed;
4472 talloc_free(mem_ctx);
4473 return 0;
4475 failed:
4476 talloc_free(mem_ctx);
4477 return -1;
4480 /* This control is sent to force the node to re-read the public addresses file
4481 and drop any addresses we should nnot longer host, and add new addresses
4482 that we are now able to host
4484 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4486 struct ctdb_reloadips_handle *h;
4487 pid_t parent = getpid();
4489 if (ctdb->reload_ips != NULL) {
4490 talloc_free(ctdb->reload_ips);
4491 ctdb->reload_ips = NULL;
4494 h = talloc(ctdb, struct ctdb_reloadips_handle);
4495 CTDB_NO_MEMORY(ctdb, h);
4496 h->ctdb = ctdb;
4497 h->c = NULL;
4498 h->status = -1;
4500 if (pipe(h->fd) == -1) {
4501 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4502 talloc_free(h);
4503 return -1;
4506 h->child = ctdb_fork(ctdb);
4507 if (h->child == (pid_t)-1) {
4508 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4509 close(h->fd[0]);
4510 close(h->fd[1]);
4511 talloc_free(h);
4512 return -1;
4515 /* child process */
4516 if (h->child == 0) {
4517 signed char res = 0;
4519 close(h->fd[0]);
4520 debug_extra = talloc_asprintf(NULL, "reloadips:");
4522 ctdb_set_process_name("ctdb_reloadips");
4523 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4524 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4525 res = -1;
4526 } else {
4527 res = ctdb_reloadips_child(ctdb);
4528 if (res != 0) {
4529 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4533 sys_write(h->fd[1], &res, 1);
4534 /* make sure we die when our parent dies */
4535 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4536 sleep(5);
4538 _exit(0);
4541 h->c = talloc_steal(h, c);
4543 close(h->fd[1]);
4544 set_close_on_exec(h->fd[0]);
4546 talloc_set_destructor(h, ctdb_reloadips_destructor);
4549 h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4550 EVENT_FD_READ, ctdb_reloadips_child_handler,
4551 (void *)h);
4552 tevent_fd_set_auto_close(h->fde);
4554 event_add_timed(ctdb->ev, h,
4555 timeval_current_ofs(120, 0),
4556 ctdb_reloadips_timeout_event, h);
4558 /* we reply later */
4559 *async_reply = true;
4560 return 0;