ctdb/daemon: Make delete IP wait until the IP is released
[Samba.git] / ctdb / server / ctdb_takeover.c
blob9c699be4fff8b7208550b05bdbb32c4aacf954d5
1 /*
2 ctdb ip takeover code
4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
/* Current time offset by the takeover_timeout tunable.  The expansion
 * refers to a variable named "ctdb" that must be in scope at the point
 * of use. */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* Seconds component of the delay between repeated ARP announcements */
#define CTDB_ARP_INTERVAL 1
/* Number of announcement rounds sent before the ARP state is freed */
#define CTDB_ARP_REPEAT 3
/* Flags used in IP allocation algorithms. */
struct ctdb_ipflags {
	/* presumably: node must not take over addresses - TODO confirm */
	bool noiptakeover;
	/* presumably: node must not host addresses - TODO confirm */
	bool noiphost;
};
/* A local network interface that public addresses can be assigned to.
 * Entries are kept on the ctdb->ifaces list. */
struct ctdb_iface {
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name, allocated as a talloc child of this entry */
	const char *name;
	/* last recorded link state */
	bool link_up;
	/* number of VNNs currently assigned to this interface */
	uint32_t references;
};
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
51 if (vnn->iface) {
52 return vnn->iface->name;
55 return "__none__";
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
60 struct ctdb_iface *i;
62 /* Verify that we dont have an entry for this ip yet */
63 for (i=ctdb->ifaces;i;i=i->next) {
64 if (strcmp(i->name, iface) == 0) {
65 return 0;
69 /* create a new structure for this interface */
70 i = talloc_zero(ctdb, struct ctdb_iface);
71 CTDB_NO_MEMORY_FATAL(ctdb, i);
72 i->name = talloc_strdup(i, iface);
73 CTDB_NO_MEMORY(ctdb, i->name);
75 * If link_up defaults to true then IPs can be allocated to a
76 * node during the first recovery. However, then an interface
77 * could have its link marked down during the startup event,
78 * causing the IP to move almost immediately. If link_up
79 * defaults to false then, during normal operation, IPs added
80 * to a new interface can't be assigned until a monitor cycle
81 * has occurred and marked the new interfaces up. This makes
82 * IP allocation unpredictable. The following is a neat
83 * compromise: early in startup link_up defaults to false, so
84 * IPs can't be assigned, and after startup IPs can be
85 * assigned immediately.
87 i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
89 DLIST_ADD(ctdb->ifaces, i);
91 return 0;
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95 const char *name)
97 int n;
99 for (n = 0; vnn->ifaces[n] != NULL; n++) {
100 if (strcmp(name, vnn->ifaces[n]) == 0) {
101 return true;
105 return false;
108 /* If any interfaces now have no possible IPs then delete them. This
109 * implementation is naive (i.e. simple) rather than clever
110 * (i.e. complex). Given that this is run on delip and that operation
111 * is rare, this doesn't need to be efficient - it needs to be
112 * foolproof. One alternative is reference counting, where the logic
113 * is distributed and can, therefore, be broken in multiple places.
114 * Another alternative is to build a red-black tree of interfaces that
115 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116 * once) and then walking ctdb->ifaces once and deleting those not in
117 * the tree. Let's go to one of those if the naive implementation
118 * causes problems... :-)
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121 struct ctdb_vnn *vnn,
122 TALLOC_CTX *mem_ctx)
124 struct ctdb_iface *i;
126 /* For each interface, check if there's an IP using it. */
127 for(i=ctdb->ifaces; i; i=i->next) {
128 struct ctdb_vnn *tv;
129 bool found;
131 /* Only consider interfaces named in the given VNN. */
132 if (!vnn_has_interface_with_name(vnn, i->name)) {
133 continue;
136 /* Is the "single IP" on this interface? */
137 if ((ctdb->single_ip_vnn != NULL) &&
138 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140 /* Found, next interface please... */
141 continue;
143 /* Search for a vnn with this interface. */
144 found = false;
145 for (tv=ctdb->vnn; tv; tv=tv->next) {
146 if (vnn_has_interface_with_name(tv, i->name)) {
147 found = true;
148 break;
152 if (!found) {
153 /* None of the VNNs are using this interface. */
154 DLIST_REMOVE(ctdb->ifaces, i);
155 /* Caller will free mem_ctx when convenient. */
156 talloc_steal(mem_ctx, i);
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163 const char *iface)
165 struct ctdb_iface *i;
167 for (i=ctdb->ifaces;i;i=i->next) {
168 if (strcmp(i->name, iface) == 0) {
169 return i;
173 return NULL;
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177 struct ctdb_vnn *vnn)
179 int i;
180 struct ctdb_iface *cur = NULL;
181 struct ctdb_iface *best = NULL;
183 for (i=0; vnn->ifaces[i]; i++) {
185 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186 if (cur == NULL) {
187 continue;
190 if (!cur->link_up) {
191 continue;
194 if (best == NULL) {
195 best = cur;
196 continue;
199 if (cur->references < best->references) {
200 best = cur;
201 continue;
205 return best;
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209 struct ctdb_vnn *vnn)
211 struct ctdb_iface *best = NULL;
213 if (vnn->iface) {
214 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215 "still assigned to iface '%s'\n",
216 ctdb_addr_to_str(&vnn->public_address),
217 ctdb_vnn_iface_string(vnn)));
218 return 0;
221 best = ctdb_vnn_best_iface(ctdb, vnn);
222 if (best == NULL) {
223 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224 "cannot assign to iface any iface\n",
225 ctdb_addr_to_str(&vnn->public_address)));
226 return -1;
229 vnn->iface = best;
230 best->references++;
231 vnn->pnn = ctdb->pnn;
233 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234 "now assigned to iface '%s' refs[%d]\n",
235 ctdb_addr_to_str(&vnn->public_address),
236 ctdb_vnn_iface_string(vnn),
237 best->references));
238 return 0;
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242 struct ctdb_vnn *vnn)
244 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245 "now unassigned (old iface '%s' refs[%d])\n",
246 ctdb_addr_to_str(&vnn->public_address),
247 ctdb_vnn_iface_string(vnn),
248 vnn->iface?vnn->iface->references:0));
249 if (vnn->iface) {
250 vnn->iface->references--;
252 vnn->iface = NULL;
253 if (vnn->pnn == ctdb->pnn) {
254 vnn->pnn = -1;
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259 struct ctdb_vnn *vnn)
261 int i;
263 if (vnn->delete_pending) {
264 return false;
267 if (vnn->iface && vnn->iface->link_up) {
268 return true;
271 for (i=0; vnn->ifaces[i]; i++) {
272 struct ctdb_iface *cur;
274 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
275 if (cur == NULL) {
276 continue;
279 if (cur->link_up) {
280 return true;
284 return false;
287 struct ctdb_takeover_arp {
288 struct ctdb_context *ctdb;
289 uint32_t count;
290 ctdb_sock_addr addr;
291 struct ctdb_tcp_array *tcparray;
292 struct ctdb_vnn *vnn;
297 lists of tcp endpoints
299 struct ctdb_tcp_list {
300 struct ctdb_tcp_list *prev, *next;
301 struct ctdb_tcp_connection connection;
305 list of clients to kill on IP release
307 struct ctdb_client_ip {
308 struct ctdb_client_ip *prev, *next;
309 struct ctdb_context *ctdb;
310 ctdb_sock_addr addr;
311 uint32_t client_id;
316 send a gratuitous arp
318 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te,
319 struct timeval t, void *private_data)
321 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
322 struct ctdb_takeover_arp);
323 int i, ret;
324 struct ctdb_tcp_array *tcparray;
325 const char *iface = ctdb_vnn_iface_string(arp->vnn);
327 ret = ctdb_sys_send_arp(&arp->addr, iface);
328 if (ret != 0) {
329 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
330 iface, strerror(errno)));
333 tcparray = arp->tcparray;
334 if (tcparray) {
335 for (i=0;i<tcparray->num;i++) {
336 struct ctdb_tcp_connection *tcon;
338 tcon = &tcparray->connections[i];
339 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
340 (unsigned)ntohs(tcon->dst_addr.ip.sin_port),
341 ctdb_addr_to_str(&tcon->src_addr),
342 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
343 ret = ctdb_sys_send_tcp(
344 &tcon->src_addr,
345 &tcon->dst_addr,
346 0, 0, 0);
347 if (ret != 0) {
348 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
349 ctdb_addr_to_str(&tcon->src_addr)));
354 arp->count++;
356 if (arp->count == CTDB_ARP_REPEAT) {
357 talloc_free(arp);
358 return;
361 event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx,
362 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
363 ctdb_control_send_arp, arp);
366 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
367 struct ctdb_vnn *vnn)
369 struct ctdb_takeover_arp *arp;
370 struct ctdb_tcp_array *tcparray;
372 if (!vnn->takeover_ctx) {
373 vnn->takeover_ctx = talloc_new(vnn);
374 if (!vnn->takeover_ctx) {
375 return -1;
379 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
380 if (!arp) {
381 return -1;
384 arp->ctdb = ctdb;
385 arp->addr = vnn->public_address;
386 arp->vnn = vnn;
388 tcparray = vnn->tcp_array;
389 if (tcparray) {
390 /* add all of the known tcp connections for this IP to the
391 list of tcp connections to send tickle acks for */
392 arp->tcparray = talloc_steal(arp, tcparray);
394 vnn->tcp_array = NULL;
395 vnn->tcp_update_needed = true;
398 event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
399 timeval_zero(), ctdb_control_send_arp, arp);
401 return 0;
404 struct takeover_callback_state {
405 struct ctdb_req_control *c;
406 ctdb_sock_addr *addr;
407 struct ctdb_vnn *vnn;
/* State passed to ctdb_do_takeip_callback() */
struct ctdb_do_takeip_state {
	/* control request to reply to */
	struct ctdb_req_control *c;
	/* VNN being taken over */
	struct ctdb_vnn *vnn;
};
416 called when takeip event finishes
418 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
419 void *private_data)
421 struct ctdb_do_takeip_state *state =
422 talloc_get_type(private_data, struct ctdb_do_takeip_state);
423 int32_t ret;
424 TDB_DATA data;
426 if (status != 0) {
427 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
429 if (status == -ETIME) {
430 ctdb_ban_self(ctdb);
432 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
433 ctdb_addr_to_str(&state->vnn->public_address),
434 ctdb_vnn_iface_string(state->vnn)));
435 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
437 node->flags |= NODE_FLAGS_UNHEALTHY;
438 talloc_free(state);
439 return;
442 if (ctdb->do_checkpublicip) {
444 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
445 if (ret != 0) {
446 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
447 talloc_free(state);
448 return;
453 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
454 data.dsize = strlen((char *)data.dptr) + 1;
455 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
457 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
460 /* the control succeeded */
461 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
462 talloc_free(state);
463 return;
466 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
468 state->vnn->update_in_flight = false;
469 return 0;
473 take over an ip address
475 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
476 struct ctdb_req_control *c,
477 struct ctdb_vnn *vnn)
479 int ret;
480 struct ctdb_do_takeip_state *state;
482 if (vnn->update_in_flight) {
483 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
484 "update for this IP already in flight\n",
485 ctdb_addr_to_str(&vnn->public_address),
486 vnn->public_netmask_bits));
487 return -1;
490 ret = ctdb_vnn_assign_iface(ctdb, vnn);
491 if (ret != 0) {
492 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
493 "assign a usable interface\n",
494 ctdb_addr_to_str(&vnn->public_address),
495 vnn->public_netmask_bits));
496 return -1;
499 state = talloc(vnn, struct ctdb_do_takeip_state);
500 CTDB_NO_MEMORY(ctdb, state);
502 state->c = talloc_steal(ctdb, c);
503 state->vnn = vnn;
505 vnn->update_in_flight = true;
506 talloc_set_destructor(state, ctdb_takeip_destructor);
508 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
509 ctdb_addr_to_str(&vnn->public_address),
510 vnn->public_netmask_bits,
511 ctdb_vnn_iface_string(vnn)));
513 ret = ctdb_event_script_callback(ctdb,
514 state,
515 ctdb_do_takeip_callback,
516 state,
517 CTDB_EVENT_TAKE_IP,
518 "%s %s %u",
519 ctdb_vnn_iface_string(vnn),
520 ctdb_addr_to_str(&vnn->public_address),
521 vnn->public_netmask_bits);
523 if (ret != 0) {
524 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
525 ctdb_addr_to_str(&vnn->public_address),
526 ctdb_vnn_iface_string(vnn)));
527 talloc_free(state);
528 return -1;
531 return 0;
/* State passed to ctdb_do_updateip_callback() */
struct ctdb_do_updateip_state {
	/* control request to reply to */
	struct ctdb_req_control *c;
	/* interface the address is being moved away from */
	struct ctdb_iface *old;
	/* VNN being moved */
	struct ctdb_vnn *vnn;
};
541 called when updateip event finishes
543 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
544 void *private_data)
546 struct ctdb_do_updateip_state *state =
547 talloc_get_type(private_data, struct ctdb_do_updateip_state);
548 int32_t ret;
550 if (status != 0) {
551 if (status == -ETIME) {
552 ctdb_ban_self(ctdb);
554 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
555 ctdb_addr_to_str(&state->vnn->public_address),
556 state->old->name,
557 ctdb_vnn_iface_string(state->vnn)));
560 * All we can do is reset the old interface
561 * and let the next run fix it
563 ctdb_vnn_unassign_iface(ctdb, state->vnn);
564 state->vnn->iface = state->old;
565 state->vnn->iface->references++;
567 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
568 talloc_free(state);
569 return;
572 if (ctdb->do_checkpublicip) {
574 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
575 if (ret != 0) {
576 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
577 talloc_free(state);
578 return;
583 /* the control succeeded */
584 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
585 talloc_free(state);
586 return;
589 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
591 state->vnn->update_in_flight = false;
592 return 0;
596 update (move) an ip address
598 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
599 struct ctdb_req_control *c,
600 struct ctdb_vnn *vnn)
602 int ret;
603 struct ctdb_do_updateip_state *state;
604 struct ctdb_iface *old = vnn->iface;
605 const char *new_name;
607 if (vnn->update_in_flight) {
608 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
609 "update for this IP already in flight\n",
610 ctdb_addr_to_str(&vnn->public_address),
611 vnn->public_netmask_bits));
612 return -1;
615 ctdb_vnn_unassign_iface(ctdb, vnn);
616 ret = ctdb_vnn_assign_iface(ctdb, vnn);
617 if (ret != 0) {
618 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
619 "assin a usable interface (old iface '%s')\n",
620 ctdb_addr_to_str(&vnn->public_address),
621 vnn->public_netmask_bits,
622 old->name));
623 return -1;
626 new_name = ctdb_vnn_iface_string(vnn);
627 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
628 /* A benign update from one interface onto itself.
629 * no need to run the eventscripts in this case, just return
630 * success.
632 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
633 return 0;
636 state = talloc(vnn, struct ctdb_do_updateip_state);
637 CTDB_NO_MEMORY(ctdb, state);
639 state->c = talloc_steal(ctdb, c);
640 state->old = old;
641 state->vnn = vnn;
643 vnn->update_in_flight = true;
644 talloc_set_destructor(state, ctdb_updateip_destructor);
646 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
647 "interface %s to %s\n",
648 ctdb_addr_to_str(&vnn->public_address),
649 vnn->public_netmask_bits,
650 old->name,
651 new_name));
653 ret = ctdb_event_script_callback(ctdb,
654 state,
655 ctdb_do_updateip_callback,
656 state,
657 CTDB_EVENT_UPDATE_IP,
658 "%s %s %s %u",
659 state->old->name,
660 new_name,
661 ctdb_addr_to_str(&vnn->public_address),
662 vnn->public_netmask_bits);
663 if (ret != 0) {
664 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
665 ctdb_addr_to_str(&vnn->public_address),
666 old->name, new_name));
667 talloc_free(state);
668 return -1;
671 return 0;
675 Find the vnn of the node that has a public ip address
676 returns -1 if the address is not known as a public address
678 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
680 struct ctdb_vnn *vnn;
682 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
683 if (ctdb_same_ip(&vnn->public_address, addr)) {
684 return vnn;
688 return NULL;
692 take over an ip address
694 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
695 struct ctdb_req_control *c,
696 TDB_DATA indata,
697 bool *async_reply)
699 int ret;
700 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
701 struct ctdb_vnn *vnn;
702 bool have_ip = false;
703 bool do_updateip = false;
704 bool do_takeip = false;
705 struct ctdb_iface *best_iface = NULL;
707 if (pip->pnn != ctdb->pnn) {
708 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
709 "with pnn %d, but we're node %d\n",
710 ctdb_addr_to_str(&pip->addr),
711 pip->pnn, ctdb->pnn));
712 return -1;
715 /* update out vnn list */
716 vnn = find_public_ip_vnn(ctdb, &pip->addr);
717 if (vnn == NULL) {
718 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
719 ctdb_addr_to_str(&pip->addr)));
720 return 0;
723 if (ctdb->do_checkpublicip) {
724 have_ip = ctdb_sys_have_ip(&pip->addr);
726 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
727 if (best_iface == NULL) {
728 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
729 "a usable interface (old %s, have_ip %d)\n",
730 ctdb_addr_to_str(&vnn->public_address),
731 vnn->public_netmask_bits,
732 ctdb_vnn_iface_string(vnn),
733 have_ip));
734 return -1;
737 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
738 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
739 have_ip = false;
743 if (vnn->iface == NULL && have_ip) {
744 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
745 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
746 ctdb_addr_to_str(&vnn->public_address)));
747 return 0;
750 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
751 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
752 "and we have it on iface[%s], but it was assigned to node %d"
753 "and we are node %d, banning ourself\n",
754 ctdb_addr_to_str(&vnn->public_address),
755 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
756 ctdb_ban_self(ctdb);
757 return -1;
760 if (vnn->pnn == -1 && have_ip) {
761 vnn->pnn = ctdb->pnn;
762 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
763 "and we already have it on iface[%s], update local daemon\n",
764 ctdb_addr_to_str(&vnn->public_address),
765 ctdb_vnn_iface_string(vnn)));
766 return 0;
769 if (vnn->iface) {
770 if (vnn->iface != best_iface) {
771 if (!vnn->iface->link_up) {
772 do_updateip = true;
773 } else if (vnn->iface->references > (best_iface->references + 1)) {
774 /* only move when the rebalance gains something */
775 do_updateip = true;
780 if (!have_ip) {
781 if (do_updateip) {
782 ctdb_vnn_unassign_iface(ctdb, vnn);
783 do_updateip = false;
785 do_takeip = true;
788 if (do_takeip) {
789 ret = ctdb_do_takeip(ctdb, c, vnn);
790 if (ret != 0) {
791 return -1;
793 } else if (do_updateip) {
794 ret = ctdb_do_updateip(ctdb, c, vnn);
795 if (ret != 0) {
796 return -1;
798 } else {
800 * The interface is up and the kernel known the ip
801 * => do nothing
803 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
804 ctdb_addr_to_str(&pip->addr),
805 vnn->public_netmask_bits,
806 ctdb_vnn_iface_string(vnn)));
807 return 0;
810 /* tell ctdb_control.c that we will be replying asynchronously */
811 *async_reply = true;
813 return 0;
817 takeover an ip address old v4 style
819 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb,
820 struct ctdb_req_control *c,
821 TDB_DATA indata,
822 bool *async_reply)
824 TDB_DATA data;
826 data.dsize = sizeof(struct ctdb_public_ip);
827 data.dptr = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
828 CTDB_NO_MEMORY(ctdb, data.dptr);
830 memcpy(data.dptr, indata.dptr, indata.dsize);
831 return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
835 kill any clients that are registered with a IP that is being released
837 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
839 struct ctdb_client_ip *ip;
841 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
842 ctdb_addr_to_str(addr)));
844 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
845 ctdb_sock_addr tmp_addr;
847 tmp_addr = ip->addr;
848 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
849 ip->client_id,
850 ctdb_addr_to_str(&ip->addr)));
852 if (ctdb_same_ip(&tmp_addr, addr)) {
853 struct ctdb_client *client = ctdb_reqid_find(ctdb,
854 ip->client_id,
855 struct ctdb_client);
856 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
857 ip->client_id,
858 ctdb_addr_to_str(&ip->addr),
859 client->pid));
861 if (client->pid != 0) {
862 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
863 (unsigned)client->pid,
864 ctdb_addr_to_str(addr),
865 ip->client_id));
866 kill(client->pid, SIGKILL);
872 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
874 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
876 DLIST_REMOVE(ctdb->vnn, vnn);
877 ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
878 ctdb_vnn_unassign_iface(ctdb, vnn);
879 talloc_free(vnn);
880 talloc_free(mem_ctx);
884 called when releaseip event finishes
886 static void release_ip_callback(struct ctdb_context *ctdb, int status,
887 void *private_data)
889 struct takeover_callback_state *state =
890 talloc_get_type(private_data, struct takeover_callback_state);
891 TDB_DATA data;
893 if (status == -ETIME) {
894 ctdb_ban_self(ctdb);
897 if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
898 DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
899 ctdb_addr_to_str(state->addr)));
900 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
901 talloc_free(state);
902 return;
905 /* send a message to all clients of this node telling them
906 that the cluster has been reconfigured and they should
907 release any sockets on this IP */
908 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
909 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
910 data.dsize = strlen((char *)data.dptr)+1;
912 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
914 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
916 /* kill clients that have registered with this IP */
917 release_kill_clients(ctdb, state->addr);
919 ctdb_vnn_unassign_iface(ctdb, state->vnn);
921 /* Process the IP if it has been marked for deletion */
922 if (state->vnn->delete_pending) {
923 do_delete_ip(ctdb, state->vnn);
924 state->vnn = NULL;
927 /* the control succeeded */
928 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
929 talloc_free(state);
932 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
934 if (state->vnn != NULL) {
935 state->vnn->update_in_flight = false;
937 return 0;
941 release an ip address
943 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
944 struct ctdb_req_control *c,
945 TDB_DATA indata,
946 bool *async_reply)
948 int ret;
949 struct takeover_callback_state *state;
950 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
951 struct ctdb_vnn *vnn;
952 char *iface;
954 /* update our vnn list */
955 vnn = find_public_ip_vnn(ctdb, &pip->addr);
956 if (vnn == NULL) {
957 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
958 ctdb_addr_to_str(&pip->addr)));
959 return 0;
961 vnn->pnn = pip->pnn;
963 /* stop any previous arps */
964 talloc_free(vnn->takeover_ctx);
965 vnn->takeover_ctx = NULL;
967 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
968 * lazy multicast to drop an IP from any node that isn't the
969 * intended new node. The following causes makes ctdbd ignore
970 * a release for any address it doesn't host.
972 if (ctdb->do_checkpublicip) {
973 if (!ctdb_sys_have_ip(&pip->addr)) {
974 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
975 ctdb_addr_to_str(&pip->addr),
976 vnn->public_netmask_bits,
977 ctdb_vnn_iface_string(vnn)));
978 ctdb_vnn_unassign_iface(ctdb, vnn);
979 return 0;
981 } else {
982 if (vnn->iface == NULL) {
983 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
984 ctdb_addr_to_str(&pip->addr),
985 vnn->public_netmask_bits));
986 return 0;
990 /* There is a potential race between take_ip and us because we
991 * update the VNN via a callback that run when the
992 * eventscripts have been run. Avoid the race by allowing one
993 * update to be in flight at a time.
995 if (vnn->update_in_flight) {
996 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
997 "update for this IP already in flight\n",
998 ctdb_addr_to_str(&vnn->public_address),
999 vnn->public_netmask_bits));
1000 return -1;
1003 if (ctdb->do_checkpublicip) {
1004 iface = ctdb_sys_find_ifname(&pip->addr);
1005 if (iface == NULL) {
1006 DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
1007 return 0;
1009 if (vnn->iface == NULL) {
1010 DEBUG(DEBUG_WARNING,
1011 ("Public IP %s is hosted on interface %s but we have no VNN\n",
1012 ctdb_addr_to_str(&pip->addr),
1013 iface));
1014 } else if (strcmp(iface, ctdb_vnn_iface_string(vnn)) != 0) {
1015 DEBUG(DEBUG_WARNING,
1016 ("Public IP %s is hosted on inteterface %s but VNN says %s\n",
1017 ctdb_addr_to_str(&pip->addr),
1018 iface,
1019 ctdb_vnn_iface_string(vnn)));
1020 /* Should we fix vnn->iface? If we do, what
1021 * happens to reference counts?
1024 } else {
1025 iface = strdup(ctdb_vnn_iface_string(vnn));
1028 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
1029 ctdb_addr_to_str(&pip->addr),
1030 vnn->public_netmask_bits,
1031 iface,
1032 pip->pnn));
1034 state = talloc(ctdb, struct takeover_callback_state);
1035 CTDB_NO_MEMORY(ctdb, state);
1037 state->c = talloc_steal(state, c);
1038 state->addr = talloc(state, ctdb_sock_addr);
1039 CTDB_NO_MEMORY(ctdb, state->addr);
1040 *state->addr = pip->addr;
1041 state->vnn = vnn;
1043 vnn->update_in_flight = true;
1044 talloc_set_destructor(state, ctdb_releaseip_destructor);
1046 ret = ctdb_event_script_callback(ctdb,
1047 state, release_ip_callback, state,
1048 CTDB_EVENT_RELEASE_IP,
1049 "%s %s %u",
1050 iface,
1051 ctdb_addr_to_str(&pip->addr),
1052 vnn->public_netmask_bits);
1053 free(iface);
1054 if (ret != 0) {
1055 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1056 ctdb_addr_to_str(&pip->addr),
1057 ctdb_vnn_iface_string(vnn)));
1058 talloc_free(state);
1059 return -1;
1062 /* tell the control that we will be reply asynchronously */
1063 *async_reply = true;
1064 return 0;
1068 release an ip address old v4 style
1070 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb,
1071 struct ctdb_req_control *c,
1072 TDB_DATA indata,
1073 bool *async_reply)
1075 TDB_DATA data;
1077 data.dsize = sizeof(struct ctdb_public_ip);
1078 data.dptr = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1079 CTDB_NO_MEMORY(ctdb, data.dptr);
1081 memcpy(data.dptr, indata.dptr, indata.dsize);
1082 return ctdb_control_release_ip(ctdb, c, data, async_reply);
1086 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1087 ctdb_sock_addr *addr,
1088 unsigned mask, const char *ifaces,
1089 bool check_address)
1091 struct ctdb_vnn *vnn;
1092 uint32_t num = 0;
1093 char *tmp;
1094 const char *iface;
1095 int i;
1096 int ret;
1098 tmp = strdup(ifaces);
1099 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1100 if (!ctdb_sys_check_iface_exists(iface)) {
1101 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1102 free(tmp);
1103 return -1;
1106 free(tmp);
1108 /* Verify that we dont have an entry for this ip yet */
1109 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1110 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1111 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1112 ctdb_addr_to_str(addr)));
1113 return -1;
1117 /* create a new vnn structure for this ip address */
1118 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1119 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1120 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1121 tmp = talloc_strdup(vnn, ifaces);
1122 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1123 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1124 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1125 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1126 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1127 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1128 num++;
1130 talloc_free(tmp);
1131 vnn->ifaces[num] = NULL;
1132 vnn->public_address = *addr;
1133 vnn->public_netmask_bits = mask;
1134 vnn->pnn = -1;
1135 if (check_address) {
1136 if (ctdb_sys_have_ip(addr)) {
1137 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1138 vnn->pnn = ctdb->pnn;
1142 for (i=0; vnn->ifaces[i]; i++) {
1143 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1144 if (ret != 0) {
1145 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1146 "for public_address[%s]\n",
1147 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1148 talloc_free(vnn);
1149 return -1;
1153 DLIST_ADD(ctdb->vnn, vnn);
1155 return 0;
1158 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te,
1159 struct timeval t, void *private_data)
1161 struct ctdb_context *ctdb = talloc_get_type(private_data,
1162 struct ctdb_context);
1163 struct ctdb_vnn *vnn;
1165 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1166 int i;
1168 for (i=0; vnn->ifaces[i] != NULL; i++) {
1169 if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1170 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1171 vnn->ifaces[i],
1172 ctdb_addr_to_str(&vnn->public_address)));
1177 event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx,
1178 timeval_current_ofs(30, 0),
1179 ctdb_check_interfaces_event, ctdb);
1183 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1185 if (ctdb->check_public_ifaces_ctx != NULL) {
1186 talloc_free(ctdb->check_public_ifaces_ctx);
1187 ctdb->check_public_ifaces_ctx = NULL;
1190 ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1191 if (ctdb->check_public_ifaces_ctx == NULL) {
1192 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1195 event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx,
1196 timeval_current_ofs(30, 0),
1197 ctdb_check_interfaces_event, ctdb);
1199 return 0;
1204 setup the public address lists from a file
1206 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1208 char **lines;
1209 int nlines;
1210 int i;
1212 lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1213 if (lines == NULL) {
1214 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1215 return -1;
1217 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1218 nlines--;
1221 for (i=0;i<nlines;i++) {
1222 unsigned mask;
1223 ctdb_sock_addr addr;
1224 const char *addrstr;
1225 const char *ifaces;
1226 char *tok, *line;
1228 line = lines[i];
1229 while ((*line == ' ') || (*line == '\t')) {
1230 line++;
1232 if (*line == '#') {
1233 continue;
1235 if (strcmp(line, "") == 0) {
1236 continue;
1238 tok = strtok(line, " \t");
1239 addrstr = tok;
1240 tok = strtok(NULL, " \t");
1241 if (tok == NULL) {
1242 if (NULL == ctdb->default_public_interface) {
1243 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1244 i+1));
1245 talloc_free(lines);
1246 return -1;
1248 ifaces = ctdb->default_public_interface;
1249 } else {
1250 ifaces = tok;
1253 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1254 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1255 talloc_free(lines);
1256 return -1;
1258 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1259 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1260 talloc_free(lines);
1261 return -1;
1266 talloc_free(lines);
1267 return 0;
1270 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1271 const char *iface,
1272 const char *ip)
1274 struct ctdb_vnn *svnn;
1275 struct ctdb_iface *cur = NULL;
1276 bool ok;
1277 int ret;
1279 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1280 CTDB_NO_MEMORY(ctdb, svnn);
1282 svnn->ifaces = talloc_array(svnn, const char *, 2);
1283 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1284 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1285 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1286 svnn->ifaces[1] = NULL;
1288 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1289 if (!ok) {
1290 talloc_free(svnn);
1291 return -1;
1294 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1295 if (ret != 0) {
1296 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1297 "for single_ip[%s]\n",
1298 svnn->ifaces[0],
1299 ctdb_addr_to_str(&svnn->public_address)));
1300 talloc_free(svnn);
1301 return -1;
1304 /* assume the single public ip interface is initially "good" */
1305 cur = ctdb_find_iface(ctdb, iface);
1306 if (cur == NULL) {
1307 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1308 return -1;
1310 cur->link_up = true;
1312 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1313 if (ret != 0) {
1314 talloc_free(svnn);
1315 return -1;
1318 ctdb->single_ip_vnn = svnn;
1319 return 0;
1322 struct ctdb_public_ip_list {
1323 struct ctdb_public_ip_list *next;
1324 uint32_t pnn;
1325 ctdb_sock_addr addr;
1328 /* Given a physical node, return the number of
1329 public addresses that is currently assigned to this node.
1331 static int node_ip_coverage(struct ctdb_context *ctdb,
1332 int32_t pnn,
1333 struct ctdb_public_ip_list *ips)
1335 int num=0;
1337 for (;ips;ips=ips->next) {
1338 if (ips->pnn == pnn) {
1339 num++;
1342 return num;
1346 /* Can the given node host the given IP: is the public IP known to the
1347 * node and is NOIPHOST unset?
1349 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1350 struct ctdb_ipflags ipflags,
1351 struct ctdb_public_ip_list *ip)
1353 struct ctdb_all_public_ips *public_ips;
1354 int i;
1356 if (ipflags.noiphost) {
1357 return false;
1360 public_ips = ctdb->nodes[pnn]->available_public_ips;
1362 if (public_ips == NULL) {
1363 return false;
1366 for (i=0; i<public_ips->num; i++) {
1367 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1368 /* yes, this node can serve this public ip */
1369 return true;
1373 return false;
1376 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1377 struct ctdb_ipflags ipflags,
1378 struct ctdb_public_ip_list *ip)
1380 if (ipflags.noiptakeover) {
1381 return false;
1384 return can_node_host_ip(ctdb, pnn, ipflags, ip);
1387 /* search the node lists list for a node to takeover this ip.
1388 pick the node that currently are serving the least number of ips
1389 so that the ips get spread out evenly.
1391 static int find_takeover_node(struct ctdb_context *ctdb,
1392 struct ctdb_ipflags *ipflags,
1393 struct ctdb_public_ip_list *ip,
1394 struct ctdb_public_ip_list *all_ips)
1396 int pnn, min=0, num;
1397 int i, numnodes;
1399 numnodes = talloc_array_length(ipflags);
1400 pnn = -1;
1401 for (i=0; i<numnodes; i++) {
1402 /* verify that this node can serve this ip */
1403 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1404 /* no it couldnt so skip to the next node */
1405 continue;
1408 num = node_ip_coverage(ctdb, i, all_ips);
1409 /* was this the first node we checked ? */
1410 if (pnn == -1) {
1411 pnn = i;
1412 min = num;
1413 } else {
1414 if (num < min) {
1415 pnn = i;
1416 min = num;
1420 if (pnn == -1) {
1421 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1422 ctdb_addr_to_str(&ip->addr)));
1424 return -1;
1427 ip->pnn = pnn;
1428 return 0;
1431 #define IP_KEYLEN 4
1432 static uint32_t *ip_key(ctdb_sock_addr *ip)
1434 static uint32_t key[IP_KEYLEN];
1436 bzero(key, sizeof(key));
1438 switch (ip->sa.sa_family) {
1439 case AF_INET:
1440 key[3] = htonl(ip->ip.sin_addr.s_addr);
1441 break;
1442 case AF_INET6: {
1443 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1444 key[0] = htonl(s6_a32[0]);
1445 key[1] = htonl(s6_a32[1]);
1446 key[2] = htonl(s6_a32[2]);
1447 key[3] = htonl(s6_a32[3]);
1448 break;
1450 default:
1451 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1452 return key;
1455 return key;
1458 static void *add_ip_callback(void *parm, void *data)
1460 struct ctdb_public_ip_list *this_ip = parm;
1461 struct ctdb_public_ip_list *prev_ip = data;
1463 if (prev_ip == NULL) {
1464 return parm;
1466 if (this_ip->pnn == -1) {
1467 this_ip->pnn = prev_ip->pnn;
1470 return parm;
1473 static int getips_count_callback(void *param, void *data)
1475 struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1476 struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1478 new_ip->next = *ip_list;
1479 *ip_list = new_ip;
1480 return 0;
1483 static struct ctdb_public_ip_list *
1484 create_merged_ip_list(struct ctdb_context *ctdb)
1486 int i, j;
1487 struct ctdb_public_ip_list *ip_list;
1488 struct ctdb_all_public_ips *public_ips;
1490 if (ctdb->ip_tree != NULL) {
1491 talloc_free(ctdb->ip_tree);
1492 ctdb->ip_tree = NULL;
1494 ctdb->ip_tree = trbt_create(ctdb, 0);
1496 for (i=0;i<ctdb->num_nodes;i++) {
1497 public_ips = ctdb->nodes[i]->known_public_ips;
1499 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1500 continue;
1503 /* there were no public ips for this node */
1504 if (public_ips == NULL) {
1505 continue;
1508 for (j=0;j<public_ips->num;j++) {
1509 struct ctdb_public_ip_list *tmp_ip;
1511 tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1512 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1513 /* Do not use information about IP addresses hosted
1514 * on other nodes, it may not be accurate */
1515 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1516 tmp_ip->pnn = public_ips->ips[j].pnn;
1517 } else {
1518 tmp_ip->pnn = -1;
1520 tmp_ip->addr = public_ips->ips[j].addr;
1521 tmp_ip->next = NULL;
1523 trbt_insertarray32_callback(ctdb->ip_tree,
1524 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1525 add_ip_callback,
1526 tmp_ip);
1530 ip_list = NULL;
1531 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1533 return ip_list;
1537 * This is the length of the longtest common prefix between the IPs.
1538 * It is calculated by XOR-ing the 2 IPs together and counting the
1539 * number of leading zeroes. The implementation means that all
1540 * addresses end up being 128 bits long.
1542 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1543 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1544 * lots of nodes and IP addresses?
1546 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1548 uint32_t ip1_k[IP_KEYLEN];
1549 uint32_t *t;
1550 int i;
1551 uint32_t x;
1553 uint32_t distance = 0;
1555 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1556 t = ip_key(ip2);
1557 for (i=0; i<IP_KEYLEN; i++) {
1558 x = ip1_k[i] ^ t[i];
1559 if (x == 0) {
1560 distance += 32;
1561 } else {
1562 /* Count number of leading zeroes.
1563 * FIXME? This could be optimised...
1565 while ((x & (1 << 31)) == 0) {
1566 x <<= 1;
1567 distance += 1;
1572 return distance;
1575 /* Calculate the IP distance for the given IP relative to IPs on the
1576 given node. The ips argument is generally the all_ips variable
1577 used in the main part of the algorithm.
1579 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1580 struct ctdb_public_ip_list *ips,
1581 int pnn)
1583 struct ctdb_public_ip_list *t;
1584 uint32_t d;
1586 uint32_t sum = 0;
1588 for (t=ips; t != NULL; t=t->next) {
1589 if (t->pnn != pnn) {
1590 continue;
1593 /* Optimisation: We never calculate the distance
1594 * between an address and itself. This allows us to
1595 * calculate the effect of removing an address from a
1596 * node by simply calculating the distance between
1597 * that address and all of the exitsing addresses.
1598 * Moreover, we assume that we're only ever dealing
1599 * with addresses from all_ips so we can identify an
1600 * address via a pointer rather than doing a more
1601 * expensive address comparison. */
1602 if (&(t->addr) == ip) {
1603 continue;
1606 d = ip_distance(ip, &(t->addr));
1607 sum += d * d; /* Cheaper than pulling in math.h :-) */
1610 return sum;
1613 /* Return the LCP2 imbalance metric for addresses currently assigned
1614 to the given node.
1616 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1618 struct ctdb_public_ip_list *t;
1620 uint32_t imbalance = 0;
1622 for (t=all_ips; t!=NULL; t=t->next) {
1623 if (t->pnn != pnn) {
1624 continue;
1626 /* Pass the rest of the IPs rather than the whole
1627 all_ips input list.
1629 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1632 return imbalance;
1635 /* Allocate any unassigned IPs just by looping through the IPs and
1636 * finding the best node for each.
1638 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1639 struct ctdb_ipflags *ipflags,
1640 struct ctdb_public_ip_list *all_ips)
1642 struct ctdb_public_ip_list *tmp_ip;
1644 /* loop over all ip's and find a physical node to cover for
1645 each unassigned ip.
1647 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1648 if (tmp_ip->pnn == -1) {
1649 if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1650 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1651 ctdb_addr_to_str(&tmp_ip->addr)));
1657 /* Basic non-deterministic rebalancing algorithm.
1659 static void basic_failback(struct ctdb_context *ctdb,
1660 struct ctdb_ipflags *ipflags,
1661 struct ctdb_public_ip_list *all_ips,
1662 int num_ips)
1664 int i, numnodes;
1665 int maxnode, maxnum, minnode, minnum, num, retries;
1666 struct ctdb_public_ip_list *tmp_ip;
1668 numnodes = talloc_array_length(ipflags);
1669 retries = 0;
1671 try_again:
1672 maxnum=0;
1673 minnum=0;
1675 /* for each ip address, loop over all nodes that can serve
1676 this ip and make sure that the difference between the node
1677 serving the most and the node serving the least ip's are
1678 not greater than 1.
1680 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1681 if (tmp_ip->pnn == -1) {
1682 continue;
1685 /* Get the highest and lowest number of ips's served by any
1686 valid node which can serve this ip.
1688 maxnode = -1;
1689 minnode = -1;
1690 for (i=0; i<numnodes; i++) {
1691 /* only check nodes that can actually serve this ip */
1692 if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1693 /* no it couldnt so skip to the next node */
1694 continue;
1697 num = node_ip_coverage(ctdb, i, all_ips);
1698 if (maxnode == -1) {
1699 maxnode = i;
1700 maxnum = num;
1701 } else {
1702 if (num > maxnum) {
1703 maxnode = i;
1704 maxnum = num;
1707 if (minnode == -1) {
1708 minnode = i;
1709 minnum = num;
1710 } else {
1711 if (num < minnum) {
1712 minnode = i;
1713 minnum = num;
1717 if (maxnode == -1) {
1718 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1719 ctdb_addr_to_str(&tmp_ip->addr)));
1721 continue;
1724 /* if the spread between the smallest and largest coverage by
1725 a node is >=2 we steal one of the ips from the node with
1726 most coverage to even things out a bit.
1727 try to do this a limited number of times since we dont
1728 want to spend too much time balancing the ip coverage.
1730 if ( (maxnum > minnum+1)
1731 && (retries < (num_ips + 5)) ){
1732 struct ctdb_public_ip_list *tmp;
1734 /* Reassign one of maxnode's VNNs */
1735 for (tmp=all_ips;tmp;tmp=tmp->next) {
1736 if (tmp->pnn == maxnode) {
1737 (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1738 retries++;
1739 goto try_again;;
1746 static void lcp2_init(struct ctdb_context *tmp_ctx,
1747 struct ctdb_ipflags *ipflags,
1748 struct ctdb_public_ip_list *all_ips,
1749 uint32_t *force_rebalance_nodes,
1750 uint32_t **lcp2_imbalances,
1751 bool **rebalance_candidates)
1753 int i, numnodes;
1754 struct ctdb_public_ip_list *tmp_ip;
1756 numnodes = talloc_array_length(ipflags);
1758 *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1759 CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1760 *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1761 CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1763 for (i=0; i<numnodes; i++) {
1764 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1765 /* First step: assume all nodes are candidates */
1766 (*rebalance_candidates)[i] = true;
1769 /* 2nd step: if a node has IPs assigned then it must have been
1770 * healthy before, so we remove it from consideration. This
1771 * is overkill but is all we have because we don't maintain
1772 * state between takeover runs. An alternative would be to
1773 * keep state and invalidate it every time the recovery master
1774 * changes.
1776 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1777 if (tmp_ip->pnn != -1) {
1778 (*rebalance_candidates)[tmp_ip->pnn] = false;
1782 /* 3rd step: if a node is forced to re-balance then
1783 we allow failback onto the node */
1784 if (force_rebalance_nodes == NULL) {
1785 return;
1787 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1788 uint32_t pnn = force_rebalance_nodes[i];
1789 if (pnn >= numnodes) {
1790 DEBUG(DEBUG_ERR,
1791 (__location__ "unknown node %u\n", pnn));
1792 continue;
1795 DEBUG(DEBUG_NOTICE,
1796 ("Forcing rebalancing of IPs to node %u\n", pnn));
1797 (*rebalance_candidates)[pnn] = true;
1801 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1802 * the IP/node combination that will cost the least.
1804 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1805 struct ctdb_ipflags *ipflags,
1806 struct ctdb_public_ip_list *all_ips,
1807 uint32_t *lcp2_imbalances)
1809 struct ctdb_public_ip_list *tmp_ip;
1810 int dstnode, numnodes;
1812 int minnode;
1813 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1814 struct ctdb_public_ip_list *minip;
1816 bool should_loop = true;
1817 bool have_unassigned = true;
1819 numnodes = talloc_array_length(ipflags);
1821 while (have_unassigned && should_loop) {
1822 should_loop = false;
1824 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1825 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1827 minnode = -1;
1828 mindsum = 0;
1829 minip = NULL;
1831 /* loop over each unassigned ip. */
1832 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1833 if (tmp_ip->pnn != -1) {
1834 continue;
1837 for (dstnode=0; dstnode<numnodes; dstnode++) {
1838 /* only check nodes that can actually takeover this ip */
1839 if (!can_node_takeover_ip(ctdb, dstnode,
1840 ipflags[dstnode],
1841 tmp_ip)) {
1842 /* no it couldnt so skip to the next node */
1843 continue;
1846 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1847 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1848 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1849 ctdb_addr_to_str(&(tmp_ip->addr)),
1850 dstnode,
1851 dstimbl - lcp2_imbalances[dstnode]));
1854 if ((minnode == -1) || (dstdsum < mindsum)) {
1855 minnode = dstnode;
1856 minimbl = dstimbl;
1857 mindsum = dstdsum;
1858 minip = tmp_ip;
1859 should_loop = true;
1864 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1866 /* If we found one then assign it to the given node. */
1867 if (minnode != -1) {
1868 minip->pnn = minnode;
1869 lcp2_imbalances[minnode] = minimbl;
1870 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1871 ctdb_addr_to_str(&(minip->addr)),
1872 minnode,
1873 mindsum));
1876 /* There might be a better way but at least this is clear. */
1877 have_unassigned = false;
1878 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1879 if (tmp_ip->pnn == -1) {
1880 have_unassigned = true;
1885 /* We know if we have an unassigned addresses so we might as
1886 * well optimise.
1888 if (have_unassigned) {
1889 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1890 if (tmp_ip->pnn == -1) {
1891 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1892 ctdb_addr_to_str(&tmp_ip->addr)));
1898 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1899 * to move IPs from, determines the best IP/destination node
1900 * combination to move from the source node.
1902 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1903 struct ctdb_ipflags *ipflags,
1904 struct ctdb_public_ip_list *all_ips,
1905 int srcnode,
1906 uint32_t *lcp2_imbalances,
1907 bool *rebalance_candidates)
1909 int dstnode, mindstnode, numnodes;
1910 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1911 uint32_t minsrcimbl, mindstimbl;
1912 struct ctdb_public_ip_list *minip;
1913 struct ctdb_public_ip_list *tmp_ip;
1915 /* Find an IP and destination node that best reduces imbalance. */
1916 srcimbl = 0;
1917 minip = NULL;
1918 minsrcimbl = 0;
1919 mindstnode = -1;
1920 mindstimbl = 0;
1922 numnodes = talloc_array_length(ipflags);
1924 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1925 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1926 srcnode, lcp2_imbalances[srcnode]));
1928 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1929 /* Only consider addresses on srcnode. */
1930 if (tmp_ip->pnn != srcnode) {
1931 continue;
1934 /* What is this IP address costing the source node? */
1935 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1936 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1938 /* Consider this IP address would cost each potential
1939 * destination node. Destination nodes are limited to
1940 * those that are newly healthy, since we don't want
1941 * to do gratuitous failover of IPs just to make minor
1942 * balance improvements.
1944 for (dstnode=0; dstnode<numnodes; dstnode++) {
1945 if (!rebalance_candidates[dstnode]) {
1946 continue;
1949 /* only check nodes that can actually takeover this ip */
1950 if (!can_node_takeover_ip(ctdb, dstnode,
1951 ipflags[dstnode], tmp_ip)) {
1952 /* no it couldnt so skip to the next node */
1953 continue;
1956 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1957 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1958 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1959 srcnode, -srcdsum,
1960 ctdb_addr_to_str(&(tmp_ip->addr)),
1961 dstnode, dstdsum));
1963 if ((dstimbl < lcp2_imbalances[srcnode]) &&
1964 (dstdsum < srcdsum) && \
1965 ((mindstnode == -1) || \
1966 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1968 minip = tmp_ip;
1969 minsrcimbl = srcimbl;
1970 mindstnode = dstnode;
1971 mindstimbl = dstimbl;
1975 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1977 if (mindstnode != -1) {
1978 /* We found a move that makes things better... */
1979 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1980 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1981 ctdb_addr_to_str(&(minip->addr)),
1982 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1985 lcp2_imbalances[srcnode] = minsrcimbl;
1986 lcp2_imbalances[mindstnode] = mindstimbl;
1987 minip->pnn = mindstnode;
1989 return true;
1992 return false;
/* A node number paired with that node's LCP2 imbalance, used for
 * sorting nodes by imbalance. */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparison function ordering entries by imbalance, largest
 * first: returns -1 when a has the larger imbalance, 1 when b has,
 * and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	return (lhs->imbalance < rhs->imbalance) -
	       (lhs->imbalance > rhs->imbalance);
}
2015 /* LCP2 algorithm for rebalancing the cluster. This finds the source
2016 * node with the highest LCP2 imbalance, and then determines the best
2017 * IP/destination node combination to move from the source node.
2019 static void lcp2_failback(struct ctdb_context *ctdb,
2020 struct ctdb_ipflags *ipflags,
2021 struct ctdb_public_ip_list *all_ips,
2022 uint32_t *lcp2_imbalances,
2023 bool *rebalance_candidates)
2025 int i, numnodes;
2026 struct lcp2_imbalance_pnn * lips;
2027 bool again;
2029 numnodes = talloc_array_length(ipflags);
2031 try_again:
2032 /* Put the imbalances and nodes into an array, sort them and
2033 * iterate through candidates. Usually the 1st one will be
2034 * used, so this doesn't cost much...
2036 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
2037 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
2038 lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2039 for (i=0; i<numnodes; i++) {
2040 lips[i].imbalance = lcp2_imbalances[i];
2041 lips[i].pnn = i;
2042 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
2044 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2045 lcp2_cmp_imbalance_pnn);
2047 again = false;
2048 for (i=0; i<numnodes; i++) {
2049 /* This means that all nodes had 0 or 1 addresses, so
2050 * can't be imbalanced.
2052 if (lips[i].imbalance == 0) {
2053 break;
2056 if (lcp2_failback_candidate(ctdb,
2057 ipflags,
2058 all_ips,
2059 lips[i].pnn,
2060 lcp2_imbalances,
2061 rebalance_candidates)) {
2062 again = true;
2063 break;
2067 talloc_free(lips);
2068 if (again) {
2069 goto try_again;
2073 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2074 struct ctdb_ipflags *ipflags,
2075 struct ctdb_public_ip_list *all_ips)
2077 struct ctdb_public_ip_list *tmp_ip;
2079 /* verify that the assigned nodes can serve that public ip
2080 and set it to -1 if not
2082 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2083 if (tmp_ip->pnn == -1) {
2084 continue;
2086 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2087 ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2088 /* this node can not serve this ip. */
2089 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2090 ctdb_addr_to_str(&(tmp_ip->addr)),
2091 tmp_ip->pnn));
2092 tmp_ip->pnn = -1;
2097 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2098 struct ctdb_ipflags *ipflags,
2099 struct ctdb_public_ip_list *all_ips)
2101 struct ctdb_public_ip_list *tmp_ip;
2102 int i, numnodes;
2104 numnodes = talloc_array_length(ipflags);
2106 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2107 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2108 * always be allocated the same way for a specific set of
2109 * available/unavailable nodes.
2112 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2113 tmp_ip->pnn = i % numnodes;
2116 /* IP failback doesn't make sense with deterministic
2117 * IPs, since the modulo step above implicitly fails
2118 * back IPs to their "home" node.
2120 if (1 == ctdb->tunable.no_ip_failback) {
2121 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2124 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2126 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2128 /* No failback here! */
2131 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2132 struct ctdb_ipflags *ipflags,
2133 struct ctdb_public_ip_list *all_ips)
2135 /* This should be pushed down into basic_failback. */
2136 struct ctdb_public_ip_list *tmp_ip;
2137 int num_ips = 0;
2138 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2139 num_ips++;
2142 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2144 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2146 /* If we don't want IPs to fail back then don't rebalance IPs. */
2147 if (1 == ctdb->tunable.no_ip_failback) {
2148 return;
2151 /* Now, try to make sure the ip adresses are evenly distributed
2152 across the nodes.
2154 basic_failback(ctdb, ipflags, all_ips, num_ips);
2157 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2158 struct ctdb_ipflags *ipflags,
2159 struct ctdb_public_ip_list *all_ips,
2160 uint32_t *force_rebalance_nodes)
2162 uint32_t *lcp2_imbalances;
2163 bool *rebalance_candidates;
2164 int numnodes, num_rebalance_candidates, i;
2166 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2168 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2170 lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2171 &lcp2_imbalances, &rebalance_candidates);
2173 lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2175 /* If we don't want IPs to fail back then don't rebalance IPs. */
2176 if (1 == ctdb->tunable.no_ip_failback) {
2177 goto finished;
2180 /* It is only worth continuing if we have suitable target
2181 * nodes to transfer IPs to. This check is much cheaper than
2182 * continuing on...
2184 numnodes = talloc_array_length(ipflags);
2185 num_rebalance_candidates = 0;
2186 for (i=0; i<numnodes; i++) {
2187 if (rebalance_candidates[i]) {
2188 num_rebalance_candidates++;
2191 if (num_rebalance_candidates == 0) {
2192 goto finished;
2195 /* Now, try to make sure the ip adresses are evenly distributed
2196 across the nodes.
2198 lcp2_failback(ctdb, ipflags, all_ips,
2199 lcp2_imbalances, rebalance_candidates);
2201 finished:
2202 talloc_free(tmp_ctx);
2205 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2207 int i, num_healthy;
2209 /* Count how many completely healthy nodes we have */
2210 num_healthy = 0;
2211 for (i=0;i<nodemap->num;i++) {
2212 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2213 num_healthy++;
2217 return num_healthy == 0;
2220 /* The calculation part of the IP allocation algorithm. */
2221 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2222 struct ctdb_ipflags *ipflags,
2223 struct ctdb_public_ip_list **all_ips_p,
2224 uint32_t *force_rebalance_nodes)
2226 /* since nodes only know about those public addresses that
2227 can be served by that particular node, no single node has
2228 a full list of all public addresses that exist in the cluster.
2229 Walk over all node structures and create a merged list of
2230 all public addresses that exist in the cluster.
2232 keep the tree of ips around as ctdb->ip_tree
2234 *all_ips_p = create_merged_ip_list(ctdb);
2236 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2237 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2238 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2239 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2240 } else {
2241 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2244 /* at this point ->pnn is the node which will own each IP
2245 or -1 if there is no node that can cover this ip
2248 return;
2251 struct get_tunable_callback_data {
2252 const char *tunable;
2253 uint32_t *out;
2254 bool fatal;
2257 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2258 int32_t res, TDB_DATA outdata,
2259 void *callback)
2261 struct get_tunable_callback_data *cd =
2262 (struct get_tunable_callback_data *)callback;
2263 int size;
2265 if (res != 0) {
2266 /* Already handled in fail callback */
2267 return;
2270 if (outdata.dsize != sizeof(uint32_t)) {
2271 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2272 cd->tunable, pnn, (int)sizeof(uint32_t),
2273 (int)outdata.dsize));
2274 cd->fatal = true;
2275 return;
2278 size = talloc_array_length(cd->out);
2279 if (pnn >= size) {
2280 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2281 cd->tunable, pnn, size));
2282 return;
2286 cd->out[pnn] = *(uint32_t *)outdata.dptr;
2289 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2290 int32_t res, TDB_DATA outdata,
2291 void *callback)
2293 struct get_tunable_callback_data *cd =
2294 (struct get_tunable_callback_data *)callback;
2296 switch (res) {
2297 case -ETIME:
2298 DEBUG(DEBUG_ERR,
2299 ("Timed out getting tunable \"%s\" from node %d\n",
2300 cd->tunable, pnn));
2301 cd->fatal = true;
2302 break;
2303 case -EINVAL:
2304 case -1:
2305 DEBUG(DEBUG_WARNING,
2306 ("Tunable \"%s\" not implemented on node %d\n",
2307 cd->tunable, pnn));
2308 break;
2309 default:
2310 DEBUG(DEBUG_ERR,
2311 ("Unexpected error getting tunable \"%s\" from node %d\n",
2312 cd->tunable, pnn));
2313 cd->fatal = true;
2317 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2318 TALLOC_CTX *tmp_ctx,
2319 struct ctdb_node_map *nodemap,
2320 const char *tunable,
2321 uint32_t default_value)
2323 TDB_DATA data;
2324 struct ctdb_control_get_tunable *t;
2325 uint32_t *nodes;
2326 uint32_t *tvals;
2327 struct get_tunable_callback_data callback_data;
2328 int i;
2330 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2331 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2332 for (i=0; i<nodemap->num; i++) {
2333 tvals[i] = default_value;
2336 callback_data.out = tvals;
2337 callback_data.tunable = tunable;
2338 callback_data.fatal = false;
2340 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2341 data.dptr = talloc_size(tmp_ctx, data.dsize);
2342 t = (struct ctdb_control_get_tunable *)data.dptr;
2343 t->length = strlen(tunable)+1;
2344 memcpy(t->name, tunable, t->length);
2345 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2346 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2347 nodes, 0, TAKEOVER_TIMEOUT(),
2348 false, data,
2349 get_tunable_callback,
2350 get_tunable_fail_callback,
2351 &callback_data) != 0) {
2352 if (callback_data.fatal) {
2353 talloc_free(tvals);
2354 tvals = NULL;
2357 talloc_free(nodes);
2358 talloc_free(data.dptr);
2360 return tvals;
2363 struct get_runstate_callback_data {
2364 enum ctdb_runstate *out;
2365 bool fatal;
2368 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2369 int32_t res, TDB_DATA outdata,
2370 void *callback_data)
2372 struct get_runstate_callback_data *cd =
2373 (struct get_runstate_callback_data *)callback_data;
2374 int size;
2376 if (res != 0) {
2377 /* Already handled in fail callback */
2378 return;
2381 if (outdata.dsize != sizeof(uint32_t)) {
2382 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2383 pnn, (int)sizeof(uint32_t),
2384 (int)outdata.dsize));
2385 cd->fatal = true;
2386 return;
2389 size = talloc_array_length(cd->out);
2390 if (pnn >= size) {
2391 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2392 pnn, size));
2393 return;
2396 cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2399 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2400 int32_t res, TDB_DATA outdata,
2401 void *callback)
2403 struct get_runstate_callback_data *cd =
2404 (struct get_runstate_callback_data *)callback;
2406 switch (res) {
2407 case -ETIME:
2408 DEBUG(DEBUG_ERR,
2409 ("Timed out getting runstate from node %d\n", pnn));
2410 cd->fatal = true;
2411 break;
2412 default:
2413 DEBUG(DEBUG_WARNING,
2414 ("Error getting runstate from node %d - assuming runstates not supported\n",
2415 pnn));
2419 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2420 TALLOC_CTX *tmp_ctx,
2421 struct ctdb_node_map *nodemap,
2422 enum ctdb_runstate default_value)
2424 uint32_t *nodes;
2425 enum ctdb_runstate *rs;
2426 struct get_runstate_callback_data callback_data;
2427 int i;
2429 rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2430 CTDB_NO_MEMORY_NULL(ctdb, rs);
2431 for (i=0; i<nodemap->num; i++) {
2432 rs[i] = default_value;
2435 callback_data.out = rs;
2436 callback_data.fatal = false;
2438 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2439 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2440 nodes, 0, TAKEOVER_TIMEOUT(),
2441 true, tdb_null,
2442 get_runstate_callback,
2443 get_runstate_fail_callback,
2444 &callback_data) != 0) {
2445 if (callback_data.fatal) {
2446 free(rs);
2447 rs = NULL;
2450 talloc_free(nodes);
2452 return rs;
2455 /* Set internal flags for IP allocation:
2456 * Clear ip flags
2457 * Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2458 * Set NOIPHOST ip flag for each INACTIVE node
2459 * if all nodes are disabled:
2460 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2461 * else
2462 * Set NOIPHOST ip flags for disabled nodes
2464 static struct ctdb_ipflags *
2465 set_ipflags_internal(struct ctdb_context *ctdb,
2466 TALLOC_CTX *tmp_ctx,
2467 struct ctdb_node_map *nodemap,
2468 uint32_t *tval_noiptakeover,
2469 uint32_t *tval_noiphostonalldisabled,
2470 enum ctdb_runstate *runstate)
2472 int i;
2473 struct ctdb_ipflags *ipflags;
2475 /* Clear IP flags - implicit due to talloc_zero */
2476 ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2477 CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2479 for (i=0;i<nodemap->num;i++) {
2480 /* Can not take IPs on node with NoIPTakeover set */
2481 if (tval_noiptakeover[i] != 0) {
2482 ipflags[i].noiptakeover = true;
2485 /* Can not host IPs on node not in RUNNING state */
2486 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2487 ipflags[i].noiphost = true;
2488 continue;
2490 /* Can not host IPs on INACTIVE node */
2491 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2492 ipflags[i].noiphost = true;
2496 if (all_nodes_are_disabled(nodemap)) {
2497 /* If all nodes are disabled, can not host IPs on node
2498 * with NoIPHostOnAllDisabled set
2500 for (i=0;i<nodemap->num;i++) {
2501 if (tval_noiphostonalldisabled[i] != 0) {
2502 ipflags[i].noiphost = true;
2505 } else {
2506 /* If some nodes are not disabled, then can not host
2507 * IPs on DISABLED node
2509 for (i=0;i<nodemap->num;i++) {
2510 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2511 ipflags[i].noiphost = true;
2516 return ipflags;
2519 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2520 TALLOC_CTX *tmp_ctx,
2521 struct ctdb_node_map *nodemap)
2523 uint32_t *tval_noiptakeover;
2524 uint32_t *tval_noiphostonalldisabled;
2525 struct ctdb_ipflags *ipflags;
2526 enum ctdb_runstate *runstate;
2529 tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2530 "NoIPTakeover", 0);
2531 if (tval_noiptakeover == NULL) {
2532 return NULL;
2535 tval_noiphostonalldisabled =
2536 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2537 "NoIPHostOnAllDisabled", 0);
2538 if (tval_noiphostonalldisabled == NULL) {
2539 /* Caller frees tmp_ctx */
2540 return NULL;
2543 /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2544 * will default to CTDB_RUNSTATE_RUNNING. This ensures
2545 * reasonable behaviour on a mixed cluster during upgrade.
2547 runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2548 CTDB_RUNSTATE_RUNNING);
2549 if (runstate == NULL) {
2550 /* Caller frees tmp_ctx */
2551 return NULL;
2554 ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2555 tval_noiptakeover,
2556 tval_noiphostonalldisabled,
2557 runstate);
2559 talloc_free(tval_noiptakeover);
2560 talloc_free(tval_noiphostonalldisabled);
2561 talloc_free(runstate);
2563 return ipflags;
2566 struct iprealloc_callback_data {
2567 bool *retry_nodes;
2568 int retry_count;
2569 client_async_callback fail_callback;
2570 void *fail_callback_data;
2571 struct ctdb_node_map *nodemap;
2574 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2575 int32_t res, TDB_DATA outdata,
2576 void *callback)
2578 int numnodes;
2579 struct iprealloc_callback_data *cd =
2580 (struct iprealloc_callback_data *)callback;
2582 numnodes = talloc_array_length(cd->retry_nodes);
2583 if (pnn > numnodes) {
2584 DEBUG(DEBUG_ERR,
2585 ("ipreallocated failure from node %d, "
2586 "but only %d nodes in nodemap\n",
2587 pnn, numnodes));
2588 return;
2591 /* Can't run the "ipreallocated" event on a INACTIVE node */
2592 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2593 DEBUG(DEBUG_WARNING,
2594 ("ipreallocated failed on inactive node %d, ignoring\n",
2595 pnn));
2596 return;
2599 switch (res) {
2600 case -ETIME:
2601 /* If the control timed out then that's a real error,
2602 * so call the real fail callback
2604 if (cd->fail_callback) {
2605 cd->fail_callback(ctdb, pnn, res, outdata,
2606 cd->fail_callback_data);
2607 } else {
2608 DEBUG(DEBUG_WARNING,
2609 ("iprealloc timed out but no callback registered\n"));
2611 break;
2612 default:
2613 /* If not a timeout then either the ipreallocated
2614 * eventscript (or some setup) failed. This might
2615 * have failed because the IPREALLOCATED control isn't
2616 * implemented - right now there is no way of knowing
2617 * because the error codes are all folded down to -1.
2618 * Consider retrying using EVENTSCRIPT control...
2620 DEBUG(DEBUG_WARNING,
2621 ("ipreallocated failure from node %d, flagging retry\n",
2622 pnn));
2623 cd->retry_nodes[pnn] = true;
2624 cd->retry_count++;
2628 struct takeover_callback_data {
2629 bool *node_failed;
2630 client_async_callback fail_callback;
2631 void *fail_callback_data;
2632 struct ctdb_node_map *nodemap;
2635 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2636 uint32_t node_pnn, int32_t res,
2637 TDB_DATA outdata, void *callback_data)
2639 struct takeover_callback_data *cd =
2640 talloc_get_type_abort(callback_data,
2641 struct takeover_callback_data);
2642 int i;
2644 for (i = 0; i < cd->nodemap->num; i++) {
2645 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2646 break;
2650 if (i == cd->nodemap->num) {
2651 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2652 return;
2655 if (!cd->node_failed[i]) {
2656 cd->node_failed[i] = true;
2657 cd->fail_callback(ctdb, node_pnn, res, outdata,
2658 cd->fail_callback_data);
2663 make any IP alias changes for public addresses that are necessary
2665 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2666 uint32_t *force_rebalance_nodes,
2667 client_async_callback fail_callback, void *callback_data)
2669 int i, j, ret;
2670 struct ctdb_public_ip ip;
2671 struct ctdb_public_ipv4 ipv4;
2672 uint32_t *nodes;
2673 struct ctdb_public_ip_list *all_ips, *tmp_ip;
2674 TDB_DATA data;
2675 struct timeval timeout;
2676 struct client_async_data *async_data;
2677 struct ctdb_client_control_state *state;
2678 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2679 struct ctdb_ipflags *ipflags;
2680 struct takeover_callback_data *takeover_data;
2681 struct iprealloc_callback_data iprealloc_data;
2682 bool *retry_data;
2685 * ip failover is completely disabled, just send out the
2686 * ipreallocated event.
2688 if (ctdb->tunable.disable_ip_failover != 0) {
2689 goto ipreallocated;
2692 ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2693 if (ipflags == NULL) {
2694 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2695 talloc_free(tmp_ctx);
2696 return -1;
2699 ZERO_STRUCT(ip);
2701 /* Do the IP reassignment calculations */
2702 ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2704 /* Now tell all nodes to release any public IPs should not
2705 * host. This will be a NOOP on nodes that don't currently
2706 * hold the given IP.
2708 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2709 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2711 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2712 bool, nodemap->num);
2713 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2714 takeover_data->fail_callback = fail_callback;
2715 takeover_data->fail_callback_data = callback_data;
2716 takeover_data->nodemap = nodemap;
2718 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2719 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2721 async_data->fail_callback = takeover_run_fail_callback;
2722 async_data->callback_data = takeover_data;
2724 for (i=0;i<nodemap->num;i++) {
2725 /* don't talk to unconnected nodes, but do talk to banned nodes */
2726 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2727 continue;
2730 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2731 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2732 /* This node should be serving this
2733 vnn so dont tell it to release the ip
2735 continue;
2737 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2738 ipv4.pnn = tmp_ip->pnn;
2739 ipv4.sin = tmp_ip->addr.ip;
2741 timeout = TAKEOVER_TIMEOUT();
2742 data.dsize = sizeof(ipv4);
2743 data.dptr = (uint8_t *)&ipv4;
2744 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2745 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2746 data, async_data,
2747 &timeout, NULL);
2748 } else {
2749 ip.pnn = tmp_ip->pnn;
2750 ip.addr = tmp_ip->addr;
2752 timeout = TAKEOVER_TIMEOUT();
2753 data.dsize = sizeof(ip);
2754 data.dptr = (uint8_t *)&ip;
2755 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2756 0, CTDB_CONTROL_RELEASE_IP, 0,
2757 data, async_data,
2758 &timeout, NULL);
2761 if (state == NULL) {
2762 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2763 talloc_free(tmp_ctx);
2764 return -1;
2767 ctdb_client_async_add(async_data, state);
2770 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2771 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2772 talloc_free(tmp_ctx);
2773 return -1;
2775 talloc_free(async_data);
2778 /* tell all nodes to get their own IPs */
2779 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2780 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2782 async_data->fail_callback = fail_callback;
2783 async_data->callback_data = callback_data;
2785 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2786 if (tmp_ip->pnn == -1) {
2787 /* this IP won't be taken over */
2788 continue;
2791 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2792 ipv4.pnn = tmp_ip->pnn;
2793 ipv4.sin = tmp_ip->addr.ip;
2795 timeout = TAKEOVER_TIMEOUT();
2796 data.dsize = sizeof(ipv4);
2797 data.dptr = (uint8_t *)&ipv4;
2798 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2799 0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2800 data, async_data,
2801 &timeout, NULL);
2802 } else {
2803 ip.pnn = tmp_ip->pnn;
2804 ip.addr = tmp_ip->addr;
2806 timeout = TAKEOVER_TIMEOUT();
2807 data.dsize = sizeof(ip);
2808 data.dptr = (uint8_t *)&ip;
2809 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2810 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2811 data, async_data,
2812 &timeout, NULL);
2814 if (state == NULL) {
2815 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2816 talloc_free(tmp_ctx);
2817 return -1;
2820 ctdb_client_async_add(async_data, state);
2822 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2823 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2824 talloc_free(tmp_ctx);
2825 return -1;
2828 ipreallocated:
2830 * Tell all nodes to run eventscripts to process the
2831 * "ipreallocated" event. This can do a lot of things,
2832 * including restarting services to reconfigure them if public
2833 * IPs have moved. Once upon a time this event only used to
2834 * update natwg.
2836 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2837 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2838 iprealloc_data.retry_nodes = retry_data;
2839 iprealloc_data.retry_count = 0;
2840 iprealloc_data.fail_callback = fail_callback;
2841 iprealloc_data.fail_callback_data = callback_data;
2842 iprealloc_data.nodemap = nodemap;
2844 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2845 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2846 nodes, 0, TAKEOVER_TIMEOUT(),
2847 false, tdb_null,
2848 NULL, iprealloc_fail_callback,
2849 &iprealloc_data);
2850 if (ret != 0) {
2851 /* If the control failed then we should retry to any
2852 * nodes flagged by iprealloc_fail_callback using the
2853 * EVENTSCRIPT control. This is a best-effort at
2854 * backward compatiblity when running a mixed cluster
2855 * where some nodes have not yet been upgraded to
2856 * support the IPREALLOCATED control.
2858 DEBUG(DEBUG_WARNING,
2859 ("Retry ipreallocated to some nodes using eventscript control\n"));
2861 nodes = talloc_array(tmp_ctx, uint32_t,
2862 iprealloc_data.retry_count);
2863 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2865 j = 0;
2866 for (i=0; i<nodemap->num; i++) {
2867 if (iprealloc_data.retry_nodes[i]) {
2868 nodes[j] = i;
2869 j++;
2873 data.dptr = discard_const("ipreallocated");
2874 data.dsize = strlen((char *)data.dptr) + 1;
2875 ret = ctdb_client_async_control(ctdb,
2876 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2877 nodes, 0, TAKEOVER_TIMEOUT(),
2878 false, data,
2879 NULL, fail_callback,
2880 callback_data);
2881 if (ret != 0) {
2882 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2886 talloc_free(tmp_ctx);
2887 return ret;
2892 destroy a ctdb_client_ip structure
2894 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2896 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2897 ctdb_addr_to_str(&ip->addr),
2898 ntohs(ip->addr.ip.sin_port),
2899 ip->client_id));
2901 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2902 return 0;
2906 called by a client to inform us of a TCP connection that it is managing
2907 that should tickled with an ACK when IP takeover is done
2908 we handle both the old ipv4 style of packets as well as the new ipv4/6
2909 pdus.
2911 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2912 TDB_DATA indata)
2914 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2915 struct ctdb_control_tcp *old_addr = NULL;
2916 struct ctdb_control_tcp_addr new_addr;
2917 struct ctdb_control_tcp_addr *tcp_sock = NULL;
2918 struct ctdb_tcp_list *tcp;
2919 struct ctdb_tcp_connection t;
2920 int ret;
2921 TDB_DATA data;
2922 struct ctdb_client_ip *ip;
2923 struct ctdb_vnn *vnn;
2924 ctdb_sock_addr addr;
2926 /* If we don't have public IPs, tickles are useless */
2927 if (ctdb->vnn == NULL) {
2928 return 0;
2931 switch (indata.dsize) {
2932 case sizeof(struct ctdb_control_tcp):
2933 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2934 ZERO_STRUCT(new_addr);
2935 tcp_sock = &new_addr;
2936 tcp_sock->src.ip = old_addr->src;
2937 tcp_sock->dest.ip = old_addr->dest;
2938 break;
2939 case sizeof(struct ctdb_control_tcp_addr):
2940 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2941 break;
2942 default:
2943 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2944 "to ctdb_control_tcp_client. size was %d but "
2945 "only allowed sizes are %lu and %lu\n",
2946 (int)indata.dsize,
2947 (long unsigned)sizeof(struct ctdb_control_tcp),
2948 (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2949 return -1;
2952 addr = tcp_sock->src;
2953 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2954 addr = tcp_sock->dest;
2955 ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2957 ZERO_STRUCT(addr);
2958 memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2959 vnn = find_public_ip_vnn(ctdb, &addr);
2960 if (vnn == NULL) {
2961 switch (addr.sa.sa_family) {
2962 case AF_INET:
2963 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2964 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2965 ctdb_addr_to_str(&addr)));
2967 break;
2968 case AF_INET6:
2969 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2970 ctdb_addr_to_str(&addr)));
2971 break;
2972 default:
2973 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2976 return 0;
2979 if (vnn->pnn != ctdb->pnn) {
2980 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2981 ctdb_addr_to_str(&addr),
2982 client_id, client->pid));
2983 /* failing this call will tell smbd to die */
2984 return -1;
2987 ip = talloc(client, struct ctdb_client_ip);
2988 CTDB_NO_MEMORY(ctdb, ip);
2990 ip->ctdb = ctdb;
2991 ip->addr = addr;
2992 ip->client_id = client_id;
2993 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2994 DLIST_ADD(ctdb->client_ip_list, ip);
2996 tcp = talloc(client, struct ctdb_tcp_list);
2997 CTDB_NO_MEMORY(ctdb, tcp);
2999 tcp->connection.src_addr = tcp_sock->src;
3000 tcp->connection.dst_addr = tcp_sock->dest;
3002 DLIST_ADD(client->tcp_list, tcp);
3004 t.src_addr = tcp_sock->src;
3005 t.dst_addr = tcp_sock->dest;
3007 data.dptr = (uint8_t *)&t;
3008 data.dsize = sizeof(t);
3010 switch (addr.sa.sa_family) {
3011 case AF_INET:
3012 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
3013 (unsigned)ntohs(tcp_sock->dest.ip.sin_port),
3014 ctdb_addr_to_str(&tcp_sock->src),
3015 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
3016 break;
3017 case AF_INET6:
3018 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
3019 (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port),
3020 ctdb_addr_to_str(&tcp_sock->src),
3021 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
3022 break;
3023 default:
3024 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
3028 /* tell all nodes about this tcp connection */
3029 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
3030 CTDB_CONTROL_TCP_ADD,
3031 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3032 if (ret != 0) {
3033 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
3034 return -1;
3037 return 0;
3041 find a tcp address on a list
3043 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
3044 struct ctdb_tcp_connection *tcp)
3046 int i;
3048 if (array == NULL) {
3049 return NULL;
3052 for (i=0;i<array->num;i++) {
3053 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
3054 ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
3055 return &array->connections[i];
3058 return NULL;
3064 called by a daemon to inform us of a TCP connection that one of its
3065 clients managing that should tickled with an ACK when IP takeover is
3066 done
3068 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
3070 struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
3071 struct ctdb_tcp_array *tcparray;
3072 struct ctdb_tcp_connection tcp;
3073 struct ctdb_vnn *vnn;
3075 /* If we don't have public IPs, tickles are useless */
3076 if (ctdb->vnn == NULL) {
3077 return 0;
3080 vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
3081 if (vnn == NULL) {
3082 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
3083 ctdb_addr_to_str(&p->dst_addr)));
3085 return -1;
3089 tcparray = vnn->tcp_array;
3091 /* If this is the first tickle */
3092 if (tcparray == NULL) {
3093 tcparray = talloc(vnn, struct ctdb_tcp_array);
3094 CTDB_NO_MEMORY(ctdb, tcparray);
3095 vnn->tcp_array = tcparray;
3097 tcparray->num = 0;
3098 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
3099 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3101 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3102 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3103 tcparray->num++;
3105 if (tcp_update_needed) {
3106 vnn->tcp_update_needed = true;
3108 return 0;
3112 /* Do we already have this tickle ?*/
3113 tcp.src_addr = p->src_addr;
3114 tcp.dst_addr = p->dst_addr;
3115 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
3116 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
3117 ctdb_addr_to_str(&tcp.dst_addr),
3118 ntohs(tcp.dst_addr.ip.sin_port),
3119 vnn->pnn));
3120 return 0;
3123 /* A new tickle, we must add it to the array */
3124 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
3125 struct ctdb_tcp_connection,
3126 tcparray->num+1);
3127 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3129 tcparray->connections[tcparray->num].src_addr = p->src_addr;
3130 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
3131 tcparray->num++;
3133 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3134 ctdb_addr_to_str(&tcp.dst_addr),
3135 ntohs(tcp.dst_addr.ip.sin_port),
3136 vnn->pnn));
3138 if (tcp_update_needed) {
3139 vnn->tcp_update_needed = true;
3142 return 0;
3147 called by a daemon to inform us of a TCP connection that one of its
3148 clients managing that should tickled with an ACK when IP takeover is
3149 done
3151 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3153 struct ctdb_tcp_connection *tcpp;
3154 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3156 if (vnn == NULL) {
3157 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3158 ctdb_addr_to_str(&conn->dst_addr)));
3159 return;
3162 /* if the array is empty we cant remove it
3163 and we dont need to do anything
3165 if (vnn->tcp_array == NULL) {
3166 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3167 ctdb_addr_to_str(&conn->dst_addr),
3168 ntohs(conn->dst_addr.ip.sin_port)));
3169 return;
3173 /* See if we know this connection
3174 if we dont know this connection then we dont need to do anything
3176 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3177 if (tcpp == NULL) {
3178 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3179 ctdb_addr_to_str(&conn->dst_addr),
3180 ntohs(conn->dst_addr.ip.sin_port)));
3181 return;
3185 /* We need to remove this entry from the array.
3186 Instead of allocating a new array and copying data to it
3187 we cheat and just copy the last entry in the existing array
3188 to the entry that is to be removed and just shring the
3189 ->num field
3191 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3192 vnn->tcp_array->num--;
3194 /* If we deleted the last entry we also need to remove the entire array
3196 if (vnn->tcp_array->num == 0) {
3197 talloc_free(vnn->tcp_array);
3198 vnn->tcp_array = NULL;
3201 vnn->tcp_update_needed = true;
3203 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3204 ctdb_addr_to_str(&conn->src_addr),
3205 ntohs(conn->src_addr.ip.sin_port)));
3210 called by a daemon to inform us of a TCP connection that one of its
3211 clients used are no longer needed in the tickle database
3213 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3215 struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3217 /* If we don't have public IPs, tickles are useless */
3218 if (ctdb->vnn == NULL) {
3219 return 0;
3222 ctdb_remove_tcp_connection(ctdb, conn);
3224 return 0;
3229 Called when another daemon starts - caises all tickles for all
3230 public addresses we are serving to be sent to the new node on the
3231 next check. This actually causes the next scheduled call to
3232 tdb_update_tcp_tickles() to update all nodes. This is simple and
3233 doesn't require careful error handling.
3235 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3237 struct ctdb_vnn *vnn;
3239 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3240 vnn->tcp_update_needed = true;
3243 return 0;
3248 called when a client structure goes away - hook to remove
3249 elements from the tcp_list in all daemons
3251 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3253 while (client->tcp_list) {
3254 struct ctdb_tcp_list *tcp = client->tcp_list;
3255 DLIST_REMOVE(client->tcp_list, tcp);
3256 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3262 release all IPs on shutdown
3264 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3266 struct ctdb_vnn *vnn;
3267 int count = 0;
3269 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3270 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3271 ctdb_vnn_unassign_iface(ctdb, vnn);
3272 continue;
3274 if (!vnn->iface) {
3275 continue;
3278 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3279 ctdb_addr_to_str(&vnn->public_address),
3280 vnn->public_netmask_bits,
3281 ctdb_vnn_iface_string(vnn)));
3283 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3284 ctdb_vnn_iface_string(vnn),
3285 ctdb_addr_to_str(&vnn->public_address),
3286 vnn->public_netmask_bits);
3287 release_kill_clients(ctdb, &vnn->public_address);
3288 ctdb_vnn_unassign_iface(ctdb, vnn);
3289 count++;
3292 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3297 get list of public IPs
3299 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3300 struct ctdb_req_control *c, TDB_DATA *outdata)
3302 int i, num, len;
3303 struct ctdb_all_public_ips *ips;
3304 struct ctdb_vnn *vnn;
3305 bool only_available = false;
3307 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3308 only_available = true;
3311 /* count how many public ip structures we have */
3312 num = 0;
3313 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3314 num++;
3317 len = offsetof(struct ctdb_all_public_ips, ips) +
3318 num*sizeof(struct ctdb_public_ip);
3319 ips = talloc_zero_size(outdata, len);
3320 CTDB_NO_MEMORY(ctdb, ips);
3322 i = 0;
3323 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3324 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3325 continue;
3327 ips->ips[i].pnn = vnn->pnn;
3328 ips->ips[i].addr = vnn->public_address;
3329 i++;
3331 ips->num = i;
3332 len = offsetof(struct ctdb_all_public_ips, ips) +
3333 i*sizeof(struct ctdb_public_ip);
3335 outdata->dsize = len;
3336 outdata->dptr = (uint8_t *)ips;
3338 return 0;
3343 get list of public IPs, old ipv4 style. only returns ipv4 addresses
3345 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb,
3346 struct ctdb_req_control *c, TDB_DATA *outdata)
3348 int i, num, len;
3349 struct ctdb_all_public_ipsv4 *ips;
3350 struct ctdb_vnn *vnn;
3352 /* count how many public ip structures we have */
3353 num = 0;
3354 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3355 if (vnn->public_address.sa.sa_family != AF_INET) {
3356 continue;
3358 num++;
3361 len = offsetof(struct ctdb_all_public_ipsv4, ips) +
3362 num*sizeof(struct ctdb_public_ipv4);
3363 ips = talloc_zero_size(outdata, len);
3364 CTDB_NO_MEMORY(ctdb, ips);
3366 outdata->dsize = len;
3367 outdata->dptr = (uint8_t *)ips;
3369 ips->num = num;
3370 i = 0;
3371 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3372 if (vnn->public_address.sa.sa_family != AF_INET) {
3373 continue;
3375 ips->ips[i].pnn = vnn->pnn;
3376 ips->ips[i].sin = vnn->public_address.ip;
3377 i++;
3380 return 0;
3383 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3384 struct ctdb_req_control *c,
3385 TDB_DATA indata,
3386 TDB_DATA *outdata)
3388 int i, num, len;
3389 ctdb_sock_addr *addr;
3390 struct ctdb_control_public_ip_info *info;
3391 struct ctdb_vnn *vnn;
3393 addr = (ctdb_sock_addr *)indata.dptr;
3395 vnn = find_public_ip_vnn(ctdb, addr);
3396 if (vnn == NULL) {
3397 /* if it is not a public ip it could be our 'single ip' */
3398 if (ctdb->single_ip_vnn) {
3399 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3400 vnn = ctdb->single_ip_vnn;
3404 if (vnn == NULL) {
3405 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3406 "'%s'not a public address\n",
3407 ctdb_addr_to_str(addr)));
3408 return -1;
3411 /* count how many public ip structures we have */
3412 num = 0;
3413 for (;vnn->ifaces[num];) {
3414 num++;
3417 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3418 num*sizeof(struct ctdb_control_iface_info);
3419 info = talloc_zero_size(outdata, len);
3420 CTDB_NO_MEMORY(ctdb, info);
3422 info->ip.addr = vnn->public_address;
3423 info->ip.pnn = vnn->pnn;
3424 info->active_idx = 0xFFFFFFFF;
3426 for (i=0; vnn->ifaces[i]; i++) {
3427 struct ctdb_iface *cur;
3429 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3430 if (cur == NULL) {
3431 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3432 vnn->ifaces[i]));
3433 return -1;
3435 if (vnn->iface == cur) {
3436 info->active_idx = i;
3438 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3439 info->ifaces[i].link_state = cur->link_up;
3440 info->ifaces[i].references = cur->references;
3442 info->num = i;
3443 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3444 i*sizeof(struct ctdb_control_iface_info);
3446 outdata->dsize = len;
3447 outdata->dptr = (uint8_t *)info;
3449 return 0;
3452 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3453 struct ctdb_req_control *c,
3454 TDB_DATA *outdata)
3456 int i, num, len;
3457 struct ctdb_control_get_ifaces *ifaces;
3458 struct ctdb_iface *cur;
3460 /* count how many public ip structures we have */
3461 num = 0;
3462 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3463 num++;
3466 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3467 num*sizeof(struct ctdb_control_iface_info);
3468 ifaces = talloc_zero_size(outdata, len);
3469 CTDB_NO_MEMORY(ctdb, ifaces);
3471 i = 0;
3472 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3473 strcpy(ifaces->ifaces[i].name, cur->name);
3474 ifaces->ifaces[i].link_state = cur->link_up;
3475 ifaces->ifaces[i].references = cur->references;
3476 i++;
3478 ifaces->num = i;
3479 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3480 i*sizeof(struct ctdb_control_iface_info);
3482 outdata->dsize = len;
3483 outdata->dptr = (uint8_t *)ifaces;
3485 return 0;
3488 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3489 struct ctdb_req_control *c,
3490 TDB_DATA indata)
3492 struct ctdb_control_iface_info *info;
3493 struct ctdb_iface *iface;
3494 bool link_up = false;
3496 info = (struct ctdb_control_iface_info *)indata.dptr;
3498 if (info->name[CTDB_IFACE_SIZE] != '\0') {
3499 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3500 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3501 len, len, info->name));
3502 return -1;
3505 switch (info->link_state) {
3506 case 0:
3507 link_up = false;
3508 break;
3509 case 1:
3510 link_up = true;
3511 break;
3512 default:
3513 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3514 (unsigned int)info->link_state));
3515 return -1;
3518 if (info->references != 0) {
3519 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3520 (unsigned int)info->references));
3521 return -1;
3524 iface = ctdb_find_iface(ctdb, info->name);
3525 if (iface == NULL) {
3526 return -1;
3529 if (link_up == iface->link_up) {
3530 return 0;
3533 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3534 ("iface[%s] has changed it's link status %s => %s\n",
3535 iface->name,
3536 iface->link_up?"up":"down",
3537 link_up?"up":"down"));
3539 iface->link_up = link_up;
3540 return 0;
3545 structure containing the listening socket and the list of tcp connections
3546 that the ctdb daemon is to kill
3548 struct ctdb_kill_tcp {
3549 struct ctdb_vnn *vnn;
3550 struct ctdb_context *ctdb;
3551 int capture_fd;
3552 struct fd_event *fde;
3553 trbt_tree_t *connections;
3554 void *private_data;
3558 a tcp connection that is to be killed
3560 struct ctdb_killtcp_con {
3561 ctdb_sock_addr src_addr;
3562 ctdb_sock_addr dst_addr;
3563 int count;
3564 struct ctdb_kill_tcp *killtcp;
3567 /* this function is used to create a key to represent this socketpair
3568 in the killtcp tree.
3569 this key is used to insert and lookup matching socketpairs that are
3570 to be tickled and RST
3572 #define KILLTCP_KEYLEN 10
3573 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3575 static uint32_t key[KILLTCP_KEYLEN];
3577 bzero(key, sizeof(key));
3579 if (src->sa.sa_family != dst->sa.sa_family) {
3580 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3581 return key;
3584 switch (src->sa.sa_family) {
3585 case AF_INET:
3586 key[0] = dst->ip.sin_addr.s_addr;
3587 key[1] = src->ip.sin_addr.s_addr;
3588 key[2] = dst->ip.sin_port;
3589 key[3] = src->ip.sin_port;
3590 break;
3591 case AF_INET6: {
3592 uint32_t *dst6_addr32 =
3593 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3594 uint32_t *src6_addr32 =
3595 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3596 key[0] = dst6_addr32[3];
3597 key[1] = src6_addr32[3];
3598 key[2] = dst6_addr32[2];
3599 key[3] = src6_addr32[2];
3600 key[4] = dst6_addr32[1];
3601 key[5] = src6_addr32[1];
3602 key[6] = dst6_addr32[0];
3603 key[7] = src6_addr32[0];
3604 key[8] = dst->ip6.sin6_port;
3605 key[9] = src->ip6.sin6_port;
3606 break;
3608 default:
3609 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3610 return key;
3613 return key;
3617 called when we get a read event on the raw socket
3619 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde,
3620 uint16_t flags, void *private_data)
3622 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3623 struct ctdb_killtcp_con *con;
3624 ctdb_sock_addr src, dst;
3625 uint32_t ack_seq, seq;
3627 if (!(flags & EVENT_FD_READ)) {
3628 return;
3631 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3632 killtcp->private_data,
3633 &src, &dst,
3634 &ack_seq, &seq) != 0) {
3635 /* probably a non-tcp ACK packet */
3636 return;
3639 /* check if we have this guy in our list of connections
3640 to kill
3642 con = trbt_lookuparray32(killtcp->connections,
3643 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3644 if (con == NULL) {
3645 /* no this was some other packet we can just ignore */
3646 return;
3649 /* This one has been tickled !
3650 now reset him and remove him from the list.
3652 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3653 ntohs(con->dst_addr.ip.sin_port),
3654 ctdb_addr_to_str(&con->src_addr),
3655 ntohs(con->src_addr.ip.sin_port)));
3657 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3658 talloc_free(con);
3662 /* when traversing the list of all tcp connections to send tickle acks to
3663 (so that we can capture the ack coming back and kill the connection
3664 by a RST)
3665 this callback is called for each connection we are currently trying to kill
3667 static int tickle_connection_traverse(void *param, void *data)
3669 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3671 /* have tried too many times, just give up */
3672 if (con->count >= 5) {
3673 /* can't delete in traverse: reparent to delete_cons */
3674 talloc_steal(param, con);
3675 return 0;
3678 /* othervise, try tickling it again */
3679 con->count++;
3680 ctdb_sys_send_tcp(
3681 (ctdb_sock_addr *)&con->dst_addr,
3682 (ctdb_sock_addr *)&con->src_addr,
3683 0, 0, 0);
3684 return 0;
3689 called every second until all sentenced connections have been reset
3691 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te,
3692 struct timeval t, void *private_data)
3694 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3695 void *delete_cons = talloc_new(NULL);
3697 /* loop over all connections sending tickle ACKs */
3698 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3700 /* now we've finished traverse, it's safe to do deletion. */
3701 talloc_free(delete_cons);
3703 /* If there are no more connections to kill we can remove the
3704 entire killtcp structure
3706 if ( (killtcp->connections == NULL) ||
3707 (killtcp->connections->root == NULL) ) {
3708 talloc_free(killtcp);
3709 return;
3712 /* try tickling them again in a seconds time
3714 event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3715 ctdb_tickle_sentenced_connections, killtcp);
3719 destroy the killtcp structure
3721 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3723 struct ctdb_vnn *tmpvnn;
3725 /* verify that this vnn is still active */
3726 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3727 if (tmpvnn == killtcp->vnn) {
3728 break;
3732 if (tmpvnn == NULL) {
3733 return 0;
3736 if (killtcp->vnn->killtcp != killtcp) {
3737 return 0;
3740 killtcp->vnn->killtcp = NULL;
3742 return 0;
/* Insert callback for the killtcp tree: unconditionally replace any
   existing connection structure (data) with the new one (parm).

   The old one is not freed here even if it existed; it is talloc_stolen
   by the same node in the tree anyway and will be deleted when the new
   data is deleted.
 */
static void *add_killtcp_callback(void *parm, void *data)
{
	return parm;
}
3759 add a tcp socket to the list of connections we want to RST
3761 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3762 ctdb_sock_addr *s,
3763 ctdb_sock_addr *d)
3765 ctdb_sock_addr src, dst;
3766 struct ctdb_kill_tcp *killtcp;
3767 struct ctdb_killtcp_con *con;
3768 struct ctdb_vnn *vnn;
3770 ctdb_canonicalize_ip(s, &src);
3771 ctdb_canonicalize_ip(d, &dst);
3773 vnn = find_public_ip_vnn(ctdb, &dst);
3774 if (vnn == NULL) {
3775 vnn = find_public_ip_vnn(ctdb, &src);
3777 if (vnn == NULL) {
3778 /* if it is not a public ip it could be our 'single ip' */
3779 if (ctdb->single_ip_vnn) {
3780 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3781 vnn = ctdb->single_ip_vnn;
3785 if (vnn == NULL) {
3786 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3787 return -1;
3790 killtcp = vnn->killtcp;
3792 /* If this is the first connection to kill we must allocate
3793 a new structure
3795 if (killtcp == NULL) {
3796 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3797 CTDB_NO_MEMORY(ctdb, killtcp);
3799 killtcp->vnn = vnn;
3800 killtcp->ctdb = ctdb;
3801 killtcp->capture_fd = -1;
3802 killtcp->connections = trbt_create(killtcp, 0);
3804 vnn->killtcp = killtcp;
3805 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3810 /* create a structure that describes this connection we want to
3811 RST and store it in killtcp->connections
3813 con = talloc(killtcp, struct ctdb_killtcp_con);
3814 CTDB_NO_MEMORY(ctdb, con);
3815 con->src_addr = src;
3816 con->dst_addr = dst;
3817 con->count = 0;
3818 con->killtcp = killtcp;
3821 trbt_insertarray32_callback(killtcp->connections,
3822 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3823 add_killtcp_callback, con);
3826 If we dont have a socket to listen on yet we must create it
3828 if (killtcp->capture_fd == -1) {
3829 const char *iface = ctdb_vnn_iface_string(vnn);
3830 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3831 if (killtcp->capture_fd == -1) {
3832 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3833 "socket on iface '%s' for killtcp (%s)\n",
3834 iface, strerror(errno)));
3835 goto failed;
3840 if (killtcp->fde == NULL) {
3841 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd,
3842 EVENT_FD_READ,
3843 capture_tcp_handler, killtcp);
3844 tevent_fd_set_auto_close(killtcp->fde);
3846 /* We also need to set up some events to tickle all these connections
3847 until they are all reset
3849 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3850 ctdb_tickle_sentenced_connections, killtcp);
3853 /* tickle him once now */
3854 ctdb_sys_send_tcp(
3855 &con->dst_addr,
3856 &con->src_addr,
3857 0, 0, 0);
3859 return 0;
3861 failed:
3862 talloc_free(vnn->killtcp);
3863 vnn->killtcp = NULL;
3864 return -1;
3868 kill a TCP connection.
3870 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3872 struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3874 return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3878 called by a daemon to inform us of the entire list of TCP tickles for
3879 a particular public address.
3880 this control should only be sent by the node that is currently serving
3881 that public address.
3883 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3885 struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3886 struct ctdb_tcp_array *tcparray;
3887 struct ctdb_vnn *vnn;
3889 /* We must at least have tickles.num or else we cant verify the size
3890 of the received data blob
3892 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3893 tickles.connections)) {
3894 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3895 return -1;
3898 /* verify that the size of data matches what we expect */
3899 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3900 tickles.connections)
3901 + sizeof(struct ctdb_tcp_connection)
3902 * list->tickles.num) {
3903 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3904 return -1;
3907 vnn = find_public_ip_vnn(ctdb, &list->addr);
3908 if (vnn == NULL) {
3909 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3910 ctdb_addr_to_str(&list->addr)));
3912 return 1;
3915 /* remove any old ticklelist we might have */
3916 talloc_free(vnn->tcp_array);
3917 vnn->tcp_array = NULL;
3919 tcparray = talloc(vnn, struct ctdb_tcp_array);
3920 CTDB_NO_MEMORY(ctdb, tcparray);
3922 tcparray->num = list->tickles.num;
3924 tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3925 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3927 memcpy(tcparray->connections, &list->tickles.connections[0],
3928 sizeof(struct ctdb_tcp_connection)*tcparray->num);
3930 /* We now have a new fresh tickle list array for this vnn */
3931 vnn->tcp_array = tcparray;
3933 return 0;
3937 called to return the full list of tickles for the puclic address associated
3938 with the provided vnn
3940 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3942 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3943 struct ctdb_control_tcp_tickle_list *list;
3944 struct ctdb_tcp_array *tcparray;
3945 int num;
3946 struct ctdb_vnn *vnn;
3948 vnn = find_public_ip_vnn(ctdb, addr);
3949 if (vnn == NULL) {
3950 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3951 ctdb_addr_to_str(addr)));
3953 return 1;
3956 tcparray = vnn->tcp_array;
3957 if (tcparray) {
3958 num = tcparray->num;
3959 } else {
3960 num = 0;
3963 outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list,
3964 tickles.connections)
3965 + sizeof(struct ctdb_tcp_connection) * num;
3967 outdata->dptr = talloc_size(outdata, outdata->dsize);
3968 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3969 list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3971 list->addr = *addr;
3972 list->tickles.num = num;
3973 if (num) {
3974 memcpy(&list->tickles.connections[0], tcparray->connections,
3975 sizeof(struct ctdb_tcp_connection) * num);
3978 return 0;
3983 set the list of all tcp tickles for a public address
3985 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3986 ctdb_sock_addr *addr,
3987 struct ctdb_tcp_array *tcparray)
3989 int ret, num;
3990 TDB_DATA data;
3991 struct ctdb_control_tcp_tickle_list *list;
3993 if (tcparray) {
3994 num = tcparray->num;
3995 } else {
3996 num = 0;
3999 data.dsize = offsetof(struct ctdb_control_tcp_tickle_list,
4000 tickles.connections) +
4001 sizeof(struct ctdb_tcp_connection) * num;
4002 data.dptr = talloc_size(ctdb, data.dsize);
4003 CTDB_NO_MEMORY(ctdb, data.dptr);
4005 list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
4006 list->addr = *addr;
4007 list->tickles.num = num;
4008 if (tcparray) {
4009 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
4012 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
4013 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
4014 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
4015 if (ret != 0) {
4016 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
4017 return -1;
4020 talloc_free(data.dptr);
4022 return ret;
4027 perform tickle updates if required
4029 static void ctdb_update_tcp_tickles(struct event_context *ev,
4030 struct timed_event *te,
4031 struct timeval t, void *private_data)
4033 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
4034 int ret;
4035 struct ctdb_vnn *vnn;
4037 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4038 /* we only send out updates for public addresses that
4039 we have taken over
4041 if (ctdb->pnn != vnn->pnn) {
4042 continue;
4044 /* We only send out the updates if we need to */
4045 if (!vnn->tcp_update_needed) {
4046 continue;
4048 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
4049 &vnn->public_address,
4050 vnn->tcp_array);
4051 if (ret != 0) {
4052 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
4053 ctdb_addr_to_str(&vnn->public_address)));
4054 } else {
4055 vnn->tcp_update_needed = false;
4059 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4060 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
4061 ctdb_update_tcp_tickles, ctdb);
4066 start periodic update of tcp tickles
4068 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
4070 ctdb->tickle_update_context = talloc_new(ctdb);
4072 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
4073 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
4074 ctdb_update_tcp_tickles, ctdb);
4080 struct control_gratious_arp {
4081 struct ctdb_context *ctdb;
4082 ctdb_sock_addr addr;
4083 const char *iface;
4084 int count;
4088 send a control_gratuitous arp
4090 static void send_gratious_arp(struct event_context *ev, struct timed_event *te,
4091 struct timeval t, void *private_data)
4093 int ret;
4094 struct control_gratious_arp *arp = talloc_get_type(private_data,
4095 struct control_gratious_arp);
4097 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
4098 if (ret != 0) {
4099 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
4100 arp->iface, strerror(errno)));
4104 arp->count++;
4105 if (arp->count == CTDB_ARP_REPEAT) {
4106 talloc_free(arp);
4107 return;
4110 event_add_timed(arp->ctdb->ev, arp,
4111 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
4112 send_gratious_arp, arp);
4117 send a gratious arp
4119 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
4121 struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
4122 struct control_gratious_arp *arp;
4124 /* verify the size of indata */
4125 if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
4126 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
4127 (unsigned)indata.dsize,
4128 (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
4129 return -1;
4131 if (indata.dsize !=
4132 ( offsetof(struct ctdb_control_gratious_arp, iface)
4133 + gratious_arp->len ) ){
4135 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4136 "but should be %u bytes\n",
4137 (unsigned)indata.dsize,
4138 (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
4139 return -1;
4143 arp = talloc(ctdb, struct control_gratious_arp);
4144 CTDB_NO_MEMORY(ctdb, arp);
4146 arp->ctdb = ctdb;
4147 arp->addr = gratious_arp->addr;
4148 arp->iface = talloc_strdup(arp, gratious_arp->iface);
4149 CTDB_NO_MEMORY(ctdb, arp->iface);
4150 arp->count = 0;
4152 event_add_timed(arp->ctdb->ev, arp,
4153 timeval_zero(), send_gratious_arp, arp);
4155 return 0;
4158 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4160 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4161 int ret;
4163 /* verify the size of indata */
4164 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4165 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4166 return -1;
4168 if (indata.dsize !=
4169 ( offsetof(struct ctdb_control_ip_iface, iface)
4170 + pub->len ) ){
4172 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4173 "but should be %u bytes\n",
4174 (unsigned)indata.dsize,
4175 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4176 return -1;
4179 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4181 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4183 if (ret != 0) {
4184 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4185 return -1;
4188 return 0;
/*
  state carried from ctdb_control_del_public_address() to
  delete_ip_callback()
 */
struct delete_ip_callback_state {
	/* the delete request that still has to be answered */
	struct ctdb_req_control *c;
};
4196 called when releaseip event finishes for del_public_address
4198 static void delete_ip_callback(struct ctdb_context *ctdb,
4199 int32_t status, TDB_DATA data,
4200 const char *errormsg,
4201 void *private_data)
4203 struct delete_ip_callback_state *state =
4204 talloc_get_type(private_data, struct delete_ip_callback_state);
4206 /* If release failed then fail. */
4207 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4208 talloc_free(private_data);
4211 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4212 struct ctdb_req_control *c,
4213 TDB_DATA indata, bool *async_reply)
4215 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4216 struct ctdb_vnn *vnn;
4218 /* verify the size of indata */
4219 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4220 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4221 return -1;
4223 if (indata.dsize !=
4224 ( offsetof(struct ctdb_control_ip_iface, iface)
4225 + pub->len ) ){
4227 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4228 "but should be %u bytes\n",
4229 (unsigned)indata.dsize,
4230 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4231 return -1;
4234 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4236 /* walk over all public addresses until we find a match */
4237 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4238 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4239 if (vnn->pnn == ctdb->pnn) {
4240 struct delete_ip_callback_state *state;
4241 struct ctdb_public_ip *ip;
4242 TDB_DATA data;
4243 int ret;
4245 vnn->delete_pending = true;
4247 state = talloc(ctdb,
4248 struct delete_ip_callback_state);
4249 CTDB_NO_MEMORY(ctdb, state);
4250 state->c = c;
4252 ip = talloc(state, struct ctdb_public_ip);
4253 if (ip == NULL) {
4254 DEBUG(DEBUG_ERR,
4255 (__location__ " Out of memory\n"));
4256 talloc_free(state);
4257 return -1;
4259 ip->pnn = -1;
4260 ip->addr = pub->addr;
4262 data.dsize = sizeof(struct ctdb_public_ip);
4263 data.dptr = (unsigned char *)ip;
4265 ret = ctdb_daemon_send_control(ctdb,
4266 ctdb_get_pnn(ctdb),
4268 CTDB_CONTROL_RELEASE_IP,
4269 0, 0,
4270 data,
4271 delete_ip_callback,
4272 state);
4273 if (ret == -1) {
4274 DEBUG(DEBUG_ERR,
4275 (__location__ "Unable to send "
4276 "CTDB_CONTROL_RELEASE_IP\n"));
4277 talloc_free(state);
4278 return -1;
4281 state->c = talloc_steal(state, c);
4282 *async_reply = true;
4283 } else {
4284 /* This IP is not hosted on the
4285 * current node so just delete it
4286 * now. */
4287 do_delete_ip(ctdb, vnn);
4290 return 0;
4294 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4295 ctdb_addr_to_str(&pub->addr)));
4296 return -1;
/*
  state carried from ctdb_control_ipreallocated() to
  ctdb_ipreallocated_callback()
 */
struct ipreallocated_callback_state {
	/* the control request that still has to be answered */
	struct ctdb_req_control *c;
};
4304 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4305 int status, void *p)
4307 struct ipreallocated_callback_state *state =
4308 talloc_get_type(p, struct ipreallocated_callback_state);
4310 if (status != 0) {
4311 DEBUG(DEBUG_ERR,
4312 (" \"ipreallocated\" event script failed (status %d)\n",
4313 status));
4314 if (status == -ETIME) {
4315 ctdb_ban_self(ctdb);
4319 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4320 talloc_free(state);
4323 /* A control to run the ipreallocated event */
4324 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4325 struct ctdb_req_control *c,
4326 bool *async_reply)
4328 int ret;
4329 struct ipreallocated_callback_state *state;
4331 state = talloc(ctdb, struct ipreallocated_callback_state);
4332 CTDB_NO_MEMORY(ctdb, state);
4334 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4336 ret = ctdb_event_script_callback(ctdb, state,
4337 ctdb_ipreallocated_callback, state,
4338 CTDB_EVENT_IPREALLOCATED,
4339 "%s", "");
4341 if (ret != 0) {
4342 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4343 talloc_free(state);
4344 return -1;
4347 /* tell the control that we will be reply asynchronously */
4348 state->c = talloc_steal(state, c);
4349 *async_reply = true;
4351 return 0;
4355 /* This function is called from the recovery daemon to verify that a remote
4356 node has the expected ip allocation.
4357 This is verified against ctdb->ip_tree
4359 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4360 struct ctdb_all_public_ips *ips,
4361 uint32_t pnn)
4363 struct ctdb_public_ip_list *tmp_ip;
4364 int i;
4366 if (ctdb->ip_tree == NULL) {
4367 /* dont know the expected allocation yet, assume remote node
4368 is correct. */
4369 return 0;
4372 if (ips == NULL) {
4373 return 0;
4376 for (i=0; i<ips->num; i++) {
4377 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4378 if (tmp_ip == NULL) {
4379 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4380 return -1;
4383 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4384 continue;
4387 if (tmp_ip->pnn != ips->ips[i].pnn) {
4388 DEBUG(DEBUG_ERR,
4389 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4390 pnn,
4391 ctdb_addr_to_str(&ips->ips[i].addr),
4392 ips->ips[i].pnn, tmp_ip->pnn));
4393 return -1;
4397 return 0;
4400 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4402 struct ctdb_public_ip_list *tmp_ip;
4404 if (ctdb->ip_tree == NULL) {
4405 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4406 return -1;
4409 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4410 if (tmp_ip == NULL) {
4411 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4412 return -1;
4415 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4416 tmp_ip->pnn = ip->pnn;
4418 return 0;
/*
  state for one in-progress "reload public ips" request
 */
struct ctdb_reloadips_handle {
	struct ctdb_context *ctdb;
	/* request to answer when the handle is destroyed (may be NULL) */
	struct ctdb_req_control *c;
	/* status sent in that reply; starts out as -1 */
	int status;
	/* pipe from the child: the parent reads fd[0], the child writes fd[1] */
	int fd[2];
	/* pid of the child doing the work; killed by the destructor */
	pid_t child;
	/* read event on fd[0] */
	struct fd_event *fde;
};
4431 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4433 if (h == h->ctdb->reload_ips) {
4434 h->ctdb->reload_ips = NULL;
4436 if (h->c != NULL) {
4437 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4438 h->c = NULL;
4440 ctdb_kill(h->ctdb, h->child, SIGKILL);
4441 return 0;
4444 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4445 struct timed_event *te,
4446 struct timeval t, void *private_data)
4448 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4450 talloc_free(h);
4453 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde,
4454 uint16_t flags, void *private_data)
4456 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4458 char res;
4459 int ret;
4461 ret = read(h->fd[0], &res, 1);
4462 if (ret < 1 || res != 0) {
4463 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4464 res = 1;
4466 h->status = res;
4468 talloc_free(h);
4471 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4473 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4474 struct ctdb_all_public_ips *ips;
4475 struct ctdb_vnn *vnn;
4476 struct client_async_data *async_data;
4477 struct timeval timeout;
4478 TDB_DATA data;
4479 struct ctdb_client_control_state *state;
4480 bool first_add;
4481 int i, ret;
4483 CTDB_NO_MEMORY(ctdb, mem_ctx);
4485 /* Read IPs from local node */
4486 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4487 CTDB_CURRENT_NODE, mem_ctx, &ips);
4488 if (ret != 0) {
4489 DEBUG(DEBUG_ERR,
4490 ("Unable to fetch public IPs from local node\n"));
4491 talloc_free(mem_ctx);
4492 return -1;
4495 /* Read IPs file - this is safe since this is a child process */
4496 ctdb->vnn = NULL;
4497 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4498 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4499 talloc_free(mem_ctx);
4500 return -1;
4503 async_data = talloc_zero(mem_ctx, struct client_async_data);
4504 CTDB_NO_MEMORY(ctdb, async_data);
4506 /* Compare IPs between node and file for IPs to be deleted */
4507 for (i = 0; i < ips->num; i++) {
4508 /* */
4509 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4510 if (ctdb_same_ip(&vnn->public_address,
4511 &ips->ips[i].addr)) {
4512 /* IP is still in file */
4513 break;
4517 if (vnn == NULL) {
4518 /* Delete IP ips->ips[i] */
4519 struct ctdb_control_ip_iface *pub;
4521 DEBUG(DEBUG_NOTICE,
4522 ("IP %s no longer configured, deleting it\n",
4523 ctdb_addr_to_str(&ips->ips[i].addr)));
4525 pub = talloc_zero(mem_ctx,
4526 struct ctdb_control_ip_iface);
4527 CTDB_NO_MEMORY(ctdb, pub);
4529 pub->addr = ips->ips[i].addr;
4530 pub->mask = 0;
4531 pub->len = 0;
4533 timeout = TAKEOVER_TIMEOUT();
4535 data.dsize = offsetof(struct ctdb_control_ip_iface,
4536 iface) + pub->len;
4537 data.dptr = (uint8_t *)pub;
4539 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4540 CTDB_CONTROL_DEL_PUBLIC_IP,
4541 0, data, async_data,
4542 &timeout, NULL);
4543 if (state == NULL) {
4544 DEBUG(DEBUG_ERR,
4545 (__location__
4546 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4547 goto failed;
4550 ctdb_client_async_add(async_data, state);
4554 /* Compare IPs between node and file for IPs to be added */
4555 first_add = true;
4556 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4557 for (i = 0; i < ips->num; i++) {
4558 if (ctdb_same_ip(&vnn->public_address,
4559 &ips->ips[i].addr)) {
4560 /* IP already on node */
4561 break;
4564 if (i == ips->num) {
4565 /* Add IP ips->ips[i] */
4566 struct ctdb_control_ip_iface *pub;
4567 const char *ifaces = NULL;
4568 uint32_t len;
4569 int iface = 0;
4571 DEBUG(DEBUG_NOTICE,
4572 ("New IP %s configured, adding it\n",
4573 ctdb_addr_to_str(&vnn->public_address)));
4574 if (first_add) {
4575 uint32_t pnn = ctdb_get_pnn(ctdb);
4577 data.dsize = sizeof(pnn);
4578 data.dptr = (uint8_t *)&pnn;
4580 ret = ctdb_client_send_message(
4581 ctdb,
4582 CTDB_BROADCAST_CONNECTED,
4583 CTDB_SRVID_REBALANCE_NODE,
4584 data);
4585 if (ret != 0) {
4586 DEBUG(DEBUG_WARNING,
4587 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4590 first_add = false;
4593 ifaces = vnn->ifaces[0];
4594 iface = 1;
4595 while (vnn->ifaces[iface] != NULL) {
4596 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4597 vnn->ifaces[iface]);
4598 iface++;
4601 len = strlen(ifaces) + 1;
4602 pub = talloc_zero_size(mem_ctx,
4603 offsetof(struct ctdb_control_ip_iface, iface) + len);
4604 CTDB_NO_MEMORY(ctdb, pub);
4606 pub->addr = vnn->public_address;
4607 pub->mask = vnn->public_netmask_bits;
4608 pub->len = len;
4609 memcpy(&pub->iface[0], ifaces, pub->len);
4611 timeout = TAKEOVER_TIMEOUT();
4613 data.dsize = offsetof(struct ctdb_control_ip_iface,
4614 iface) + pub->len;
4615 data.dptr = (uint8_t *)pub;
4617 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4618 CTDB_CONTROL_ADD_PUBLIC_IP,
4619 0, data, async_data,
4620 &timeout, NULL);
4621 if (state == NULL) {
4622 DEBUG(DEBUG_ERR,
4623 (__location__
4624 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4625 goto failed;
4628 ctdb_client_async_add(async_data, state);
4632 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4633 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4634 goto failed;
4637 talloc_free(mem_ctx);
4638 return 0;
4640 failed:
4641 talloc_free(mem_ctx);
4642 return -1;
4645 /* This control is sent to force the node to re-read the public addresses file
4646 and drop any addresses we should nnot longer host, and add new addresses
4647 that we are now able to host
4649 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4651 struct ctdb_reloadips_handle *h;
4652 pid_t parent = getpid();
4654 if (ctdb->reload_ips != NULL) {
4655 talloc_free(ctdb->reload_ips);
4656 ctdb->reload_ips = NULL;
4659 h = talloc(ctdb, struct ctdb_reloadips_handle);
4660 CTDB_NO_MEMORY(ctdb, h);
4661 h->ctdb = ctdb;
4662 h->c = NULL;
4663 h->status = -1;
4665 if (pipe(h->fd) == -1) {
4666 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4667 talloc_free(h);
4668 return -1;
4671 h->child = ctdb_fork(ctdb);
4672 if (h->child == (pid_t)-1) {
4673 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4674 close(h->fd[0]);
4675 close(h->fd[1]);
4676 talloc_free(h);
4677 return -1;
4680 /* child process */
4681 if (h->child == 0) {
4682 signed char res = 0;
4684 close(h->fd[0]);
4685 debug_extra = talloc_asprintf(NULL, "reloadips:");
4687 ctdb_set_process_name("ctdb_reloadips");
4688 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4689 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4690 res = -1;
4691 } else {
4692 res = ctdb_reloadips_child(ctdb);
4693 if (res != 0) {
4694 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4698 write(h->fd[1], &res, 1);
4699 /* make sure we die when our parent dies */
4700 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4701 sleep(5);
4703 _exit(0);
4706 h->c = talloc_steal(h, c);
4708 close(h->fd[1]);
4709 set_close_on_exec(h->fd[0]);
4711 talloc_set_destructor(h, ctdb_reloadips_destructor);
4714 h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4715 EVENT_FD_READ, ctdb_reloadips_child_handler,
4716 (void *)h);
4717 tevent_fd_set_auto_close(h->fde);
4719 event_add_timed(ctdb->ev, h,
4720 timeval_current_ofs(120, 0),
4721 ctdb_reloadips_timeout_event, h);
4723 /* we reply later */
4724 *async_reply = true;
4725 return 0;