replace: Fix bug 11455
[Samba.git] / ctdb / server / ctdb_takeover.c
blobefc80b1ab8c36a7495c90a549ded23db45c7f10d
1 /*
2 ctdb ip takeover code
4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "includes.h"
22 #include "tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
/* Deadline helper built from the takeover_timeout tunable via
 * timeval_current_ofs().  Expands to an expression that reads a variable
 * named "ctdb" from the calling scope. */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/* Gratuitous ARP scheduling: ctdb_control_send_arp() re-arms its timer with
 * CTDB_ARP_INTERVAL as the first argument of timeval_current_ofs() and stops
 * once it has run CTDB_ARP_REPEAT times. */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT 3
36 /* Flags used in IP allocation algorithms. */
37 struct ctdb_ipflags {
38 bool noiptakeover;
39 bool noiphost;
40 enum ctdb_runstate runstate;
/* One local network interface that can carry public addresses. */
struct ctdb_iface {
	/* linkage in the ctdb->ifaces list */
	struct ctdb_iface *prev, *next;
	/* interface name; allocated as a talloc child of this entry */
	const char *name;
	/* interfaces whose link is down are skipped when picking one */
	bool link_up;
	/* number of VNNs currently assigned to this interface */
	uint32_t references;
};
50 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
52 if (vnn->iface) {
53 return vnn->iface->name;
56 return "__none__";
59 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
61 struct ctdb_iface *i;
63 /* Verify that we dont have an entry for this ip yet */
64 for (i=ctdb->ifaces;i;i=i->next) {
65 if (strcmp(i->name, iface) == 0) {
66 return 0;
70 /* create a new structure for this interface */
71 i = talloc_zero(ctdb, struct ctdb_iface);
72 CTDB_NO_MEMORY_FATAL(ctdb, i);
73 i->name = talloc_strdup(i, iface);
74 CTDB_NO_MEMORY(ctdb, i->name);
76 i->link_up = true;
78 DLIST_ADD(ctdb->ifaces, i);
80 return 0;
83 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
84 const char *name)
86 int n;
88 for (n = 0; vnn->ifaces[n] != NULL; n++) {
89 if (strcmp(name, vnn->ifaces[n]) == 0) {
90 return true;
94 return false;
97 /* If any interfaces now have no possible IPs then delete them. This
98 * implementation is naive (i.e. simple) rather than clever
99 * (i.e. complex). Given that this is run on delip and that operation
100 * is rare, this doesn't need to be efficient - it needs to be
101 * foolproof. One alternative is reference counting, where the logic
102 * is distributed and can, therefore, be broken in multiple places.
103 * Another alternative is to build a red-black tree of interfaces that
104 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
105 * once) and then walking ctdb->ifaces once and deleting those not in
106 * the tree. Let's go to one of those if the naive implementation
107 * causes problems... :-)
109 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
110 struct ctdb_vnn *vnn)
112 struct ctdb_iface *i, *next;
114 /* For each interface, check if there's an IP using it. */
115 for (i = ctdb->ifaces; i != NULL; i = next) {
116 struct ctdb_vnn *tv;
117 bool found;
118 next = i->next;
120 /* Only consider interfaces named in the given VNN. */
121 if (!vnn_has_interface_with_name(vnn, i->name)) {
122 continue;
125 /* Is the "single IP" on this interface? */
126 if ((ctdb->single_ip_vnn != NULL) &&
127 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
128 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
129 /* Found, next interface please... */
130 continue;
132 /* Search for a vnn with this interface. */
133 found = false;
134 for (tv=ctdb->vnn; tv; tv=tv->next) {
135 if (vnn_has_interface_with_name(tv, i->name)) {
136 found = true;
137 break;
141 if (!found) {
142 /* None of the VNNs are using this interface. */
143 DLIST_REMOVE(ctdb->ifaces, i);
144 talloc_free(i);
150 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
151 const char *iface)
153 struct ctdb_iface *i;
155 for (i=ctdb->ifaces;i;i=i->next) {
156 if (strcmp(i->name, iface) == 0) {
157 return i;
161 return NULL;
164 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
165 struct ctdb_vnn *vnn)
167 int i;
168 struct ctdb_iface *cur = NULL;
169 struct ctdb_iface *best = NULL;
171 for (i=0; vnn->ifaces[i]; i++) {
173 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
174 if (cur == NULL) {
175 continue;
178 if (!cur->link_up) {
179 continue;
182 if (best == NULL) {
183 best = cur;
184 continue;
187 if (cur->references < best->references) {
188 best = cur;
189 continue;
193 return best;
196 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
197 struct ctdb_vnn *vnn)
199 struct ctdb_iface *best = NULL;
201 if (vnn->iface) {
202 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
203 "still assigned to iface '%s'\n",
204 ctdb_addr_to_str(&vnn->public_address),
205 ctdb_vnn_iface_string(vnn)));
206 return 0;
209 best = ctdb_vnn_best_iface(ctdb, vnn);
210 if (best == NULL) {
211 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
212 "cannot assign to iface any iface\n",
213 ctdb_addr_to_str(&vnn->public_address)));
214 return -1;
217 vnn->iface = best;
218 best->references++;
219 vnn->pnn = ctdb->pnn;
221 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
222 "now assigned to iface '%s' refs[%d]\n",
223 ctdb_addr_to_str(&vnn->public_address),
224 ctdb_vnn_iface_string(vnn),
225 best->references));
226 return 0;
229 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
230 struct ctdb_vnn *vnn)
232 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
233 "now unassigned (old iface '%s' refs[%d])\n",
234 ctdb_addr_to_str(&vnn->public_address),
235 ctdb_vnn_iface_string(vnn),
236 vnn->iface?vnn->iface->references:0));
237 if (vnn->iface) {
238 vnn->iface->references--;
240 vnn->iface = NULL;
241 if (vnn->pnn == ctdb->pnn) {
242 vnn->pnn = -1;
246 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
247 struct ctdb_vnn *vnn)
249 int i;
251 if (vnn->delete_pending) {
252 return false;
255 if (vnn->iface && vnn->iface->link_up) {
256 return true;
259 for (i=0; vnn->ifaces[i]; i++) {
260 struct ctdb_iface *cur;
262 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
263 if (cur == NULL) {
264 continue;
267 if (cur->link_up) {
268 return true;
272 return false;
275 struct ctdb_takeover_arp {
276 struct ctdb_context *ctdb;
277 uint32_t count;
278 ctdb_sock_addr addr;
279 struct ctdb_tcp_array *tcparray;
280 struct ctdb_vnn *vnn;
285 lists of tcp endpoints
287 struct ctdb_tcp_list {
288 struct ctdb_tcp_list *prev, *next;
289 struct ctdb_tcp_connection connection;
293 list of clients to kill on IP release
295 struct ctdb_client_ip {
296 struct ctdb_client_ip *prev, *next;
297 struct ctdb_context *ctdb;
298 ctdb_sock_addr addr;
299 uint32_t client_id;
304 send a gratuitous arp
306 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te,
307 struct timeval t, void *private_data)
309 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
310 struct ctdb_takeover_arp);
311 int i, ret;
312 struct ctdb_tcp_array *tcparray;
313 const char *iface = ctdb_vnn_iface_string(arp->vnn);
315 ret = ctdb_sys_send_arp(&arp->addr, iface);
316 if (ret != 0) {
317 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
318 iface, strerror(errno)));
321 tcparray = arp->tcparray;
322 if (tcparray) {
323 for (i=0;i<tcparray->num;i++) {
324 struct ctdb_tcp_connection *tcon;
326 tcon = &tcparray->connections[i];
327 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
328 (unsigned)ntohs(tcon->dst_addr.ip.sin_port),
329 ctdb_addr_to_str(&tcon->src_addr),
330 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
331 ret = ctdb_sys_send_tcp(
332 &tcon->src_addr,
333 &tcon->dst_addr,
334 0, 0, 0);
335 if (ret != 0) {
336 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
337 ctdb_addr_to_str(&tcon->src_addr)));
342 arp->count++;
344 if (arp->count == CTDB_ARP_REPEAT) {
345 talloc_free(arp);
346 return;
349 event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx,
350 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
351 ctdb_control_send_arp, arp);
354 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
355 struct ctdb_vnn *vnn)
357 struct ctdb_takeover_arp *arp;
358 struct ctdb_tcp_array *tcparray;
360 if (!vnn->takeover_ctx) {
361 vnn->takeover_ctx = talloc_new(vnn);
362 if (!vnn->takeover_ctx) {
363 return -1;
367 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
368 if (!arp) {
369 return -1;
372 arp->ctdb = ctdb;
373 arp->addr = vnn->public_address;
374 arp->vnn = vnn;
376 tcparray = vnn->tcp_array;
377 if (tcparray) {
378 /* add all of the known tcp connections for this IP to the
379 list of tcp connections to send tickle acks for */
380 arp->tcparray = talloc_steal(arp, tcparray);
382 vnn->tcp_array = NULL;
383 vnn->tcp_update_needed = true;
386 event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
387 timeval_zero(), ctdb_control_send_arp, arp);
389 return 0;
392 struct takeover_callback_state {
393 struct ctdb_req_control *c;
394 ctdb_sock_addr *addr;
395 struct ctdb_vnn *vnn;
/* State handed to ctdb_do_takeip_callback(). */
struct ctdb_do_takeip_state {
	/* control request to answer once the event script has finished */
	struct ctdb_req_control *c;
	/* VNN being taken over */
	struct ctdb_vnn *vnn;
};
404 called when takeip event finishes
406 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
407 void *private_data)
409 struct ctdb_do_takeip_state *state =
410 talloc_get_type(private_data, struct ctdb_do_takeip_state);
411 int32_t ret;
412 TDB_DATA data;
414 if (status != 0) {
415 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
417 if (status == -ETIME) {
418 ctdb_ban_self(ctdb);
420 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
421 ctdb_addr_to_str(&state->vnn->public_address),
422 ctdb_vnn_iface_string(state->vnn)));
423 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
425 node->flags |= NODE_FLAGS_UNHEALTHY;
426 talloc_free(state);
427 return;
430 if (ctdb->do_checkpublicip) {
432 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
433 if (ret != 0) {
434 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
435 talloc_free(state);
436 return;
441 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
442 data.dsize = strlen((char *)data.dptr) + 1;
443 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
445 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
448 /* the control succeeded */
449 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
450 talloc_free(state);
451 return;
454 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
456 state->vnn->update_in_flight = false;
457 return 0;
461 take over an ip address
463 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
464 struct ctdb_req_control *c,
465 struct ctdb_vnn *vnn)
467 int ret;
468 struct ctdb_do_takeip_state *state;
470 if (vnn->update_in_flight) {
471 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
472 "update for this IP already in flight\n",
473 ctdb_addr_to_str(&vnn->public_address),
474 vnn->public_netmask_bits));
475 return -1;
478 ret = ctdb_vnn_assign_iface(ctdb, vnn);
479 if (ret != 0) {
480 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
481 "assign a usable interface\n",
482 ctdb_addr_to_str(&vnn->public_address),
483 vnn->public_netmask_bits));
484 return -1;
487 state = talloc(vnn, struct ctdb_do_takeip_state);
488 CTDB_NO_MEMORY(ctdb, state);
490 state->c = talloc_steal(ctdb, c);
491 state->vnn = vnn;
493 vnn->update_in_flight = true;
494 talloc_set_destructor(state, ctdb_takeip_destructor);
496 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
497 ctdb_addr_to_str(&vnn->public_address),
498 vnn->public_netmask_bits,
499 ctdb_vnn_iface_string(vnn)));
501 ret = ctdb_event_script_callback(ctdb,
502 state,
503 ctdb_do_takeip_callback,
504 state,
505 CTDB_EVENT_TAKE_IP,
506 "%s %s %u",
507 ctdb_vnn_iface_string(vnn),
508 ctdb_addr_to_str(&vnn->public_address),
509 vnn->public_netmask_bits);
511 if (ret != 0) {
512 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
513 ctdb_addr_to_str(&vnn->public_address),
514 ctdb_vnn_iface_string(vnn)));
515 talloc_free(state);
516 return -1;
519 return 0;
/* State handed to ctdb_do_updateip_callback(). */
struct ctdb_do_updateip_state {
	/* control request to answer once the event script has finished */
	struct ctdb_req_control *c;
	/* interface the address was on before the move; restored on failure */
	struct ctdb_iface *old;
	/* VNN being moved */
	struct ctdb_vnn *vnn;
};
529 called when updateip event finishes
531 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
532 void *private_data)
534 struct ctdb_do_updateip_state *state =
535 talloc_get_type(private_data, struct ctdb_do_updateip_state);
536 int32_t ret;
538 if (status != 0) {
539 if (status == -ETIME) {
540 ctdb_ban_self(ctdb);
542 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
543 ctdb_addr_to_str(&state->vnn->public_address),
544 state->old->name,
545 ctdb_vnn_iface_string(state->vnn)));
548 * All we can do is reset the old interface
549 * and let the next run fix it
551 ctdb_vnn_unassign_iface(ctdb, state->vnn);
552 state->vnn->iface = state->old;
553 state->vnn->iface->references++;
555 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
556 talloc_free(state);
557 return;
560 if (ctdb->do_checkpublicip) {
562 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
563 if (ret != 0) {
564 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
565 talloc_free(state);
566 return;
571 /* the control succeeded */
572 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
573 talloc_free(state);
574 return;
577 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
579 state->vnn->update_in_flight = false;
580 return 0;
584 update (move) an ip address
586 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
587 struct ctdb_req_control *c,
588 struct ctdb_vnn *vnn)
590 int ret;
591 struct ctdb_do_updateip_state *state;
592 struct ctdb_iface *old = vnn->iface;
593 const char *new_name;
595 if (vnn->update_in_flight) {
596 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
597 "update for this IP already in flight\n",
598 ctdb_addr_to_str(&vnn->public_address),
599 vnn->public_netmask_bits));
600 return -1;
603 ctdb_vnn_unassign_iface(ctdb, vnn);
604 ret = ctdb_vnn_assign_iface(ctdb, vnn);
605 if (ret != 0) {
606 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
607 "assin a usable interface (old iface '%s')\n",
608 ctdb_addr_to_str(&vnn->public_address),
609 vnn->public_netmask_bits,
610 old->name));
611 return -1;
614 new_name = ctdb_vnn_iface_string(vnn);
615 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
616 /* A benign update from one interface onto itself.
617 * no need to run the eventscripts in this case, just return
618 * success.
620 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
621 return 0;
624 state = talloc(vnn, struct ctdb_do_updateip_state);
625 CTDB_NO_MEMORY(ctdb, state);
627 state->c = talloc_steal(ctdb, c);
628 state->old = old;
629 state->vnn = vnn;
631 vnn->update_in_flight = true;
632 talloc_set_destructor(state, ctdb_updateip_destructor);
634 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
635 "interface %s to %s\n",
636 ctdb_addr_to_str(&vnn->public_address),
637 vnn->public_netmask_bits,
638 old->name,
639 new_name));
641 ret = ctdb_event_script_callback(ctdb,
642 state,
643 ctdb_do_updateip_callback,
644 state,
645 CTDB_EVENT_UPDATE_IP,
646 "%s %s %s %u",
647 state->old->name,
648 new_name,
649 ctdb_addr_to_str(&vnn->public_address),
650 vnn->public_netmask_bits);
651 if (ret != 0) {
652 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
653 ctdb_addr_to_str(&vnn->public_address),
654 old->name, new_name));
655 talloc_free(state);
656 return -1;
659 return 0;
663 Find the vnn of the node that has a public ip address
664 returns -1 if the address is not known as a public address
666 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
668 struct ctdb_vnn *vnn;
670 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
671 if (ctdb_same_ip(&vnn->public_address, addr)) {
672 return vnn;
676 return NULL;
680 take over an ip address
682 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
683 struct ctdb_req_control *c,
684 TDB_DATA indata,
685 bool *async_reply)
687 int ret;
688 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
689 struct ctdb_vnn *vnn;
690 bool have_ip = false;
691 bool do_updateip = false;
692 bool do_takeip = false;
693 struct ctdb_iface *best_iface = NULL;
695 if (pip->pnn != ctdb->pnn) {
696 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
697 "with pnn %d, but we're node %d\n",
698 ctdb_addr_to_str(&pip->addr),
699 pip->pnn, ctdb->pnn));
700 return -1;
703 /* update out vnn list */
704 vnn = find_public_ip_vnn(ctdb, &pip->addr);
705 if (vnn == NULL) {
706 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
707 ctdb_addr_to_str(&pip->addr)));
708 return 0;
711 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
712 have_ip = ctdb_sys_have_ip(&pip->addr);
714 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
715 if (best_iface == NULL) {
716 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
717 "a usable interface (old %s, have_ip %d)\n",
718 ctdb_addr_to_str(&vnn->public_address),
719 vnn->public_netmask_bits,
720 ctdb_vnn_iface_string(vnn),
721 have_ip));
722 return -1;
725 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
726 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
727 have_ip = false;
731 if (vnn->iface == NULL && have_ip) {
732 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
733 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
734 ctdb_addr_to_str(&vnn->public_address)));
735 return 0;
738 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
739 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
740 "and we have it on iface[%s], but it was assigned to node %d"
741 "and we are node %d, banning ourself\n",
742 ctdb_addr_to_str(&vnn->public_address),
743 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
744 ctdb_ban_self(ctdb);
745 return -1;
748 if (vnn->pnn == -1 && have_ip) {
749 vnn->pnn = ctdb->pnn;
750 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
751 "and we already have it on iface[%s], update local daemon\n",
752 ctdb_addr_to_str(&vnn->public_address),
753 ctdb_vnn_iface_string(vnn)));
754 return 0;
757 if (vnn->iface) {
758 if (vnn->iface != best_iface) {
759 if (!vnn->iface->link_up) {
760 do_updateip = true;
761 } else if (vnn->iface->references > (best_iface->references + 1)) {
762 /* only move when the rebalance gains something */
763 do_updateip = true;
768 if (!have_ip) {
769 if (do_updateip) {
770 ctdb_vnn_unassign_iface(ctdb, vnn);
771 do_updateip = false;
773 do_takeip = true;
776 if (do_takeip) {
777 ret = ctdb_do_takeip(ctdb, c, vnn);
778 if (ret != 0) {
779 return -1;
781 } else if (do_updateip) {
782 ret = ctdb_do_updateip(ctdb, c, vnn);
783 if (ret != 0) {
784 return -1;
786 } else {
788 * The interface is up and the kernel known the ip
789 * => do nothing
791 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
792 ctdb_addr_to_str(&pip->addr),
793 vnn->public_netmask_bits,
794 ctdb_vnn_iface_string(vnn)));
795 return 0;
798 /* tell ctdb_control.c that we will be replying asynchronously */
799 *async_reply = true;
801 return 0;
805 kill any clients that are registered with a IP that is being released
807 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
809 struct ctdb_client_ip *ip;
811 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
812 ctdb_addr_to_str(addr)));
814 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
815 ctdb_sock_addr tmp_addr;
817 tmp_addr = ip->addr;
818 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
819 ip->client_id,
820 ctdb_addr_to_str(&ip->addr)));
822 if (ctdb_same_ip(&tmp_addr, addr)) {
823 struct ctdb_client *client = ctdb_reqid_find(ctdb,
824 ip->client_id,
825 struct ctdb_client);
826 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
827 ip->client_id,
828 ctdb_addr_to_str(&ip->addr),
829 client->pid));
831 if (client->pid != 0) {
832 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
833 (unsigned)client->pid,
834 ctdb_addr_to_str(addr),
835 ip->client_id));
836 kill(client->pid, SIGKILL);
842 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
844 DLIST_REMOVE(ctdb->vnn, vnn);
845 ctdb_vnn_unassign_iface(ctdb, vnn);
846 ctdb_remove_orphaned_ifaces(ctdb, vnn);
847 talloc_free(vnn);
851 called when releaseip event finishes
853 static void release_ip_callback(struct ctdb_context *ctdb, int status,
854 void *private_data)
856 struct takeover_callback_state *state =
857 talloc_get_type(private_data, struct takeover_callback_state);
858 TDB_DATA data;
860 if (status == -ETIME) {
861 ctdb_ban_self(ctdb);
864 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
865 if (ctdb_sys_have_ip(state->addr)) {
866 DEBUG(DEBUG_ERR,
867 ("IP %s still hosted during release IP callback, failing\n",
868 ctdb_addr_to_str(state->addr)));
869 ctdb_request_control_reply(ctdb, state->c,
870 NULL, -1, NULL);
871 talloc_free(state);
872 return;
876 /* send a message to all clients of this node telling them
877 that the cluster has been reconfigured and they should
878 release any sockets on this IP */
879 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
880 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
881 data.dsize = strlen((char *)data.dptr)+1;
883 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
885 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
887 /* kill clients that have registered with this IP */
888 release_kill_clients(ctdb, state->addr);
890 ctdb_vnn_unassign_iface(ctdb, state->vnn);
892 /* Process the IP if it has been marked for deletion */
893 if (state->vnn->delete_pending) {
894 do_delete_ip(ctdb, state->vnn);
895 state->vnn = NULL;
898 /* the control succeeded */
899 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
900 talloc_free(state);
903 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
905 if (state->vnn != NULL) {
906 state->vnn->update_in_flight = false;
908 return 0;
912 release an ip address
914 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
915 struct ctdb_req_control *c,
916 TDB_DATA indata,
917 bool *async_reply)
919 int ret;
920 struct takeover_callback_state *state;
921 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
922 struct ctdb_vnn *vnn;
923 char *iface;
925 /* update our vnn list */
926 vnn = find_public_ip_vnn(ctdb, &pip->addr);
927 if (vnn == NULL) {
928 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
929 ctdb_addr_to_str(&pip->addr)));
930 return 0;
932 vnn->pnn = pip->pnn;
934 /* stop any previous arps */
935 talloc_free(vnn->takeover_ctx);
936 vnn->takeover_ctx = NULL;
938 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
939 * lazy multicast to drop an IP from any node that isn't the
940 * intended new node. The following causes makes ctdbd ignore
941 * a release for any address it doesn't host.
943 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
944 if (!ctdb_sys_have_ip(&pip->addr)) {
945 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
946 ctdb_addr_to_str(&pip->addr),
947 vnn->public_netmask_bits,
948 ctdb_vnn_iface_string(vnn)));
949 ctdb_vnn_unassign_iface(ctdb, vnn);
950 return 0;
952 } else {
953 if (vnn->iface == NULL) {
954 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
955 ctdb_addr_to_str(&pip->addr),
956 vnn->public_netmask_bits));
957 return 0;
961 /* There is a potential race between take_ip and us because we
962 * update the VNN via a callback that run when the
963 * eventscripts have been run. Avoid the race by allowing one
964 * update to be in flight at a time.
966 if (vnn->update_in_flight) {
967 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
968 "update for this IP already in flight\n",
969 ctdb_addr_to_str(&vnn->public_address),
970 vnn->public_netmask_bits));
971 return -1;
974 iface = strdup(ctdb_vnn_iface_string(vnn));
976 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
977 ctdb_addr_to_str(&pip->addr),
978 vnn->public_netmask_bits,
979 iface,
980 pip->pnn));
982 state = talloc(ctdb, struct takeover_callback_state);
983 if (state == NULL) {
984 ctdb_set_error(ctdb, "Out of memory at %s:%d",
985 __FILE__, __LINE__);
986 free(iface);
987 return -1;
990 state->c = talloc_steal(state, c);
991 state->addr = talloc(state, ctdb_sock_addr);
992 if (state->addr == NULL) {
993 ctdb_set_error(ctdb, "Out of memory at %s:%d",
994 __FILE__, __LINE__);
995 free(iface);
996 talloc_free(state);
997 return -1;
999 *state->addr = pip->addr;
1000 state->vnn = vnn;
1002 vnn->update_in_flight = true;
1003 talloc_set_destructor(state, ctdb_releaseip_destructor);
1005 ret = ctdb_event_script_callback(ctdb,
1006 state, release_ip_callback, state,
1007 CTDB_EVENT_RELEASE_IP,
1008 "%s %s %u",
1009 iface,
1010 ctdb_addr_to_str(&pip->addr),
1011 vnn->public_netmask_bits);
1012 free(iface);
1013 if (ret != 0) {
1014 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1015 ctdb_addr_to_str(&pip->addr),
1016 ctdb_vnn_iface_string(vnn)));
1017 talloc_free(state);
1018 return -1;
1021 /* tell the control that we will be reply asynchronously */
1022 *async_reply = true;
1023 return 0;
1026 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1027 ctdb_sock_addr *addr,
1028 unsigned mask, const char *ifaces,
1029 bool check_address)
1031 struct ctdb_vnn *vnn;
1032 uint32_t num = 0;
1033 char *tmp;
1034 const char *iface;
1035 int i;
1036 int ret;
1038 tmp = strdup(ifaces);
1039 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1040 if (!ctdb_sys_check_iface_exists(iface)) {
1041 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1042 free(tmp);
1043 return -1;
1046 free(tmp);
1048 /* Verify that we dont have an entry for this ip yet */
1049 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1050 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1051 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1052 ctdb_addr_to_str(addr)));
1053 return -1;
1057 /* create a new vnn structure for this ip address */
1058 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1059 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1060 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1061 tmp = talloc_strdup(vnn, ifaces);
1062 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1063 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1064 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1065 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1066 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1067 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1068 num++;
1070 talloc_free(tmp);
1071 vnn->ifaces[num] = NULL;
1072 vnn->public_address = *addr;
1073 vnn->public_netmask_bits = mask;
1074 vnn->pnn = -1;
1075 if (check_address) {
1076 if (ctdb_sys_have_ip(addr)) {
1077 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1078 vnn->pnn = ctdb->pnn;
1082 for (i=0; vnn->ifaces[i]; i++) {
1083 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1084 if (ret != 0) {
1085 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1086 "for public_address[%s]\n",
1087 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1088 talloc_free(vnn);
1089 return -1;
1093 DLIST_ADD(ctdb->vnn, vnn);
1095 return 0;
1099 setup the public address lists from a file
1101 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1103 char **lines;
1104 int nlines;
1105 int i;
1107 lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1108 if (lines == NULL) {
1109 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1110 return -1;
1112 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1113 nlines--;
1116 for (i=0;i<nlines;i++) {
1117 unsigned mask;
1118 ctdb_sock_addr addr;
1119 const char *addrstr;
1120 const char *ifaces;
1121 char *tok, *line;
1123 line = lines[i];
1124 while ((*line == ' ') || (*line == '\t')) {
1125 line++;
1127 if (*line == '#') {
1128 continue;
1130 if (strcmp(line, "") == 0) {
1131 continue;
1133 tok = strtok(line, " \t");
1134 addrstr = tok;
1135 tok = strtok(NULL, " \t");
1136 if (tok == NULL) {
1137 if (NULL == ctdb->default_public_interface) {
1138 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1139 i+1));
1140 talloc_free(lines);
1141 return -1;
1143 ifaces = ctdb->default_public_interface;
1144 } else {
1145 ifaces = tok;
1148 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1149 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1150 talloc_free(lines);
1151 return -1;
1153 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1154 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1155 talloc_free(lines);
1156 return -1;
1161 talloc_free(lines);
1162 return 0;
1165 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1166 const char *iface,
1167 const char *ip)
1169 struct ctdb_vnn *svnn;
1170 struct ctdb_iface *cur = NULL;
1171 bool ok;
1172 int ret;
1174 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1175 CTDB_NO_MEMORY(ctdb, svnn);
1177 svnn->ifaces = talloc_array(svnn, const char *, 2);
1178 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1179 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1180 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1181 svnn->ifaces[1] = NULL;
1183 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1184 if (!ok) {
1185 talloc_free(svnn);
1186 return -1;
1189 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1190 if (ret != 0) {
1191 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1192 "for single_ip[%s]\n",
1193 svnn->ifaces[0],
1194 ctdb_addr_to_str(&svnn->public_address)));
1195 talloc_free(svnn);
1196 return -1;
1199 /* assume the single public ip interface is initially "good" */
1200 cur = ctdb_find_iface(ctdb, iface);
1201 if (cur == NULL) {
1202 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1203 return -1;
1205 cur->link_up = true;
1207 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1208 if (ret != 0) {
1209 talloc_free(svnn);
1210 return -1;
1213 ctdb->single_ip_vnn = svnn;
1214 return 0;
1217 struct ctdb_public_ip_list {
1218 struct ctdb_public_ip_list *next;
1219 uint32_t pnn;
1220 ctdb_sock_addr addr;
1223 /* Given a physical node, return the number of
1224 public addresses that is currently assigned to this node.
1226 static int node_ip_coverage(struct ctdb_context *ctdb,
1227 int32_t pnn,
1228 struct ctdb_public_ip_list *ips)
1230 int num=0;
1232 for (;ips;ips=ips->next) {
1233 if (ips->pnn == pnn) {
1234 num++;
1237 return num;
1241 /* Can the given node host the given IP: is the public IP known to the
1242 * node and is NOIPHOST unset?
1244 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1245 struct ctdb_ipflags ipflags,
1246 struct ctdb_public_ip_list *ip)
1248 struct ctdb_all_public_ips *public_ips;
1249 int i;
1251 if (ipflags.noiphost) {
1252 return false;
1255 public_ips = ctdb->nodes[pnn]->available_public_ips;
1257 if (public_ips == NULL) {
1258 return false;
1261 for (i=0; i<public_ips->num; i++) {
1262 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1263 /* yes, this node can serve this public ip */
1264 return true;
1268 return false;
1271 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1272 struct ctdb_ipflags ipflags,
1273 struct ctdb_public_ip_list *ip)
1275 if (ipflags.noiptakeover) {
1276 return false;
1279 return can_node_host_ip(ctdb, pnn, ipflags, ip);
1282 /* search the node lists list for a node to takeover this ip.
1283 pick the node that currently are serving the least number of ips
1284 so that the ips get spread out evenly.
1286 static int find_takeover_node(struct ctdb_context *ctdb,
1287 struct ctdb_ipflags *ipflags,
1288 struct ctdb_public_ip_list *ip,
1289 struct ctdb_public_ip_list *all_ips)
1291 int pnn, min=0, num;
1292 int i, numnodes;
1294 numnodes = talloc_array_length(ipflags);
1295 pnn = -1;
1296 for (i=0; i<numnodes; i++) {
1297 /* verify that this node can serve this ip */
1298 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1299 /* no it couldnt so skip to the next node */
1300 continue;
1303 num = node_ip_coverage(ctdb, i, all_ips);
1304 /* was this the first node we checked ? */
1305 if (pnn == -1) {
1306 pnn = i;
1307 min = num;
1308 } else {
1309 if (num < min) {
1310 pnn = i;
1311 min = num;
1315 if (pnn == -1) {
1316 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1317 ctdb_addr_to_str(&ip->addr)));
1319 return -1;
1322 ip->pnn = pnn;
1323 return 0;
1326 #define IP_KEYLEN 4
1327 static uint32_t *ip_key(ctdb_sock_addr *ip)
1329 static uint32_t key[IP_KEYLEN];
1331 bzero(key, sizeof(key));
1333 switch (ip->sa.sa_family) {
1334 case AF_INET:
1335 key[3] = htonl(ip->ip.sin_addr.s_addr);
1336 break;
1337 case AF_INET6: {
1338 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1339 key[0] = htonl(s6_a32[0]);
1340 key[1] = htonl(s6_a32[1]);
1341 key[2] = htonl(s6_a32[2]);
1342 key[3] = htonl(s6_a32[3]);
1343 break;
1345 default:
1346 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1347 return key;
1350 return key;
1353 static void *add_ip_callback(void *parm, void *data)
1355 struct ctdb_public_ip_list *this_ip = parm;
1356 struct ctdb_public_ip_list *prev_ip = data;
1358 if (prev_ip == NULL) {
1359 return parm;
1361 if (this_ip->pnn == -1) {
1362 this_ip->pnn = prev_ip->pnn;
1365 return parm;
1368 static int getips_count_callback(void *param, void *data)
1370 struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1371 struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1373 new_ip->next = *ip_list;
1374 *ip_list = new_ip;
1375 return 0;
1378 static struct ctdb_public_ip_list *
1379 create_merged_ip_list(struct ctdb_context *ctdb)
1381 int i, j;
1382 struct ctdb_public_ip_list *ip_list;
1383 struct ctdb_all_public_ips *public_ips;
1385 if (ctdb->ip_tree != NULL) {
1386 talloc_free(ctdb->ip_tree);
1387 ctdb->ip_tree = NULL;
1389 ctdb->ip_tree = trbt_create(ctdb, 0);
1391 for (i=0;i<ctdb->num_nodes;i++) {
1392 public_ips = ctdb->nodes[i]->known_public_ips;
1394 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1395 continue;
1398 /* there were no public ips for this node */
1399 if (public_ips == NULL) {
1400 continue;
1403 for (j=0;j<public_ips->num;j++) {
1404 struct ctdb_public_ip_list *tmp_ip;
1406 tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1407 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1408 /* Do not use information about IP addresses hosted
1409 * on other nodes, it may not be accurate */
1410 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1411 tmp_ip->pnn = public_ips->ips[j].pnn;
1412 } else {
1413 tmp_ip->pnn = -1;
1415 tmp_ip->addr = public_ips->ips[j].addr;
1416 tmp_ip->next = NULL;
1418 trbt_insertarray32_callback(ctdb->ip_tree,
1419 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1420 add_ip_callback,
1421 tmp_ip);
1425 ip_list = NULL;
1426 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1428 return ip_list;
1432 * This is the length of the longtest common prefix between the IPs.
1433 * It is calculated by XOR-ing the 2 IPs together and counting the
1434 * number of leading zeroes. The implementation means that all
1435 * addresses end up being 128 bits long.
1437 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1438 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1439 * lots of nodes and IP addresses?
1441 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1443 uint32_t ip1_k[IP_KEYLEN];
1444 uint32_t *t;
1445 int i;
1446 uint32_t x;
1448 uint32_t distance = 0;
1450 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1451 t = ip_key(ip2);
1452 for (i=0; i<IP_KEYLEN; i++) {
1453 x = ip1_k[i] ^ t[i];
1454 if (x == 0) {
1455 distance += 32;
1456 } else {
1457 /* Count number of leading zeroes.
1458 * FIXME? This could be optimised...
1460 while ((x & (1 << 31)) == 0) {
1461 x <<= 1;
1462 distance += 1;
1467 return distance;
1470 /* Calculate the IP distance for the given IP relative to IPs on the
1471 given node. The ips argument is generally the all_ips variable
1472 used in the main part of the algorithm.
1474 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1475 struct ctdb_public_ip_list *ips,
1476 int pnn)
1478 struct ctdb_public_ip_list *t;
1479 uint32_t d;
1481 uint32_t sum = 0;
1483 for (t=ips; t != NULL; t=t->next) {
1484 if (t->pnn != pnn) {
1485 continue;
1488 /* Optimisation: We never calculate the distance
1489 * between an address and itself. This allows us to
1490 * calculate the effect of removing an address from a
1491 * node by simply calculating the distance between
1492 * that address and all of the exitsing addresses.
1493 * Moreover, we assume that we're only ever dealing
1494 * with addresses from all_ips so we can identify an
1495 * address via a pointer rather than doing a more
1496 * expensive address comparison. */
1497 if (&(t->addr) == ip) {
1498 continue;
1501 d = ip_distance(ip, &(t->addr));
1502 sum += d * d; /* Cheaper than pulling in math.h :-) */
1505 return sum;
1508 /* Return the LCP2 imbalance metric for addresses currently assigned
1509 to the given node.
1511 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1513 struct ctdb_public_ip_list *t;
1515 uint32_t imbalance = 0;
1517 for (t=all_ips; t!=NULL; t=t->next) {
1518 if (t->pnn != pnn) {
1519 continue;
1521 /* Pass the rest of the IPs rather than the whole
1522 all_ips input list.
1524 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1527 return imbalance;
1530 /* Allocate any unassigned IPs just by looping through the IPs and
1531 * finding the best node for each.
1533 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1534 struct ctdb_ipflags *ipflags,
1535 struct ctdb_public_ip_list *all_ips)
1537 struct ctdb_public_ip_list *tmp_ip;
1539 /* loop over all ip's and find a physical node to cover for
1540 each unassigned ip.
1542 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1543 if (tmp_ip->pnn == -1) {
1544 if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1545 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1546 ctdb_addr_to_str(&tmp_ip->addr)));
1552 /* Basic non-deterministic rebalancing algorithm.
1554 static void basic_failback(struct ctdb_context *ctdb,
1555 struct ctdb_ipflags *ipflags,
1556 struct ctdb_public_ip_list *all_ips,
1557 int num_ips)
1559 int i, numnodes;
1560 int maxnode, maxnum, minnode, minnum, num, retries;
1561 struct ctdb_public_ip_list *tmp_ip;
1563 numnodes = talloc_array_length(ipflags);
1564 retries = 0;
1566 try_again:
1567 maxnum=0;
1568 minnum=0;
1570 /* for each ip address, loop over all nodes that can serve
1571 this ip and make sure that the difference between the node
1572 serving the most and the node serving the least ip's are
1573 not greater than 1.
1575 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1576 if (tmp_ip->pnn == -1) {
1577 continue;
1580 /* Get the highest and lowest number of ips's served by any
1581 valid node which can serve this ip.
1583 maxnode = -1;
1584 minnode = -1;
1585 for (i=0; i<numnodes; i++) {
1586 /* only check nodes that can actually serve this ip */
1587 if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1588 /* no it couldnt so skip to the next node */
1589 continue;
1592 num = node_ip_coverage(ctdb, i, all_ips);
1593 if (maxnode == -1) {
1594 maxnode = i;
1595 maxnum = num;
1596 } else {
1597 if (num > maxnum) {
1598 maxnode = i;
1599 maxnum = num;
1602 if (minnode == -1) {
1603 minnode = i;
1604 minnum = num;
1605 } else {
1606 if (num < minnum) {
1607 minnode = i;
1608 minnum = num;
1612 if (maxnode == -1) {
1613 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1614 ctdb_addr_to_str(&tmp_ip->addr)));
1616 continue;
1619 /* if the spread between the smallest and largest coverage by
1620 a node is >=2 we steal one of the ips from the node with
1621 most coverage to even things out a bit.
1622 try to do this a limited number of times since we dont
1623 want to spend too much time balancing the ip coverage.
1625 if ( (maxnum > minnum+1)
1626 && (retries < (num_ips + 5)) ){
1627 struct ctdb_public_ip_list *tmp;
1629 /* Reassign one of maxnode's VNNs */
1630 for (tmp=all_ips;tmp;tmp=tmp->next) {
1631 if (tmp->pnn == maxnode) {
1632 (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1633 retries++;
1634 goto try_again;;
1641 static void lcp2_init(struct ctdb_context *tmp_ctx,
1642 struct ctdb_ipflags *ipflags,
1643 struct ctdb_public_ip_list *all_ips,
1644 uint32_t *force_rebalance_nodes,
1645 uint32_t **lcp2_imbalances,
1646 bool **rebalance_candidates)
1648 int i, numnodes;
1649 struct ctdb_public_ip_list *tmp_ip;
1651 numnodes = talloc_array_length(ipflags);
1653 *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1654 CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1655 *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1656 CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1658 for (i=0; i<numnodes; i++) {
1659 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1660 /* First step: assume all nodes are candidates */
1661 (*rebalance_candidates)[i] = true;
1664 /* 2nd step: if a node has IPs assigned then it must have been
1665 * healthy before, so we remove it from consideration. This
1666 * is overkill but is all we have because we don't maintain
1667 * state between takeover runs. An alternative would be to
1668 * keep state and invalidate it every time the recovery master
1669 * changes.
1671 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1672 if (tmp_ip->pnn != -1) {
1673 (*rebalance_candidates)[tmp_ip->pnn] = false;
1677 /* 3rd step: if a node is forced to re-balance then
1678 we allow failback onto the node */
1679 if (force_rebalance_nodes == NULL) {
1680 return;
1682 for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
1683 uint32_t pnn = force_rebalance_nodes[i];
1684 if (pnn >= numnodes) {
1685 DEBUG(DEBUG_ERR,
1686 (__location__ "unknown node %u\n", pnn));
1687 continue;
1690 DEBUG(DEBUG_NOTICE,
1691 ("Forcing rebalancing of IPs to node %u\n", pnn));
1692 (*rebalance_candidates)[pnn] = true;
1696 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1697 * the IP/node combination that will cost the least.
1699 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1700 struct ctdb_ipflags *ipflags,
1701 struct ctdb_public_ip_list *all_ips,
1702 uint32_t *lcp2_imbalances)
1704 struct ctdb_public_ip_list *tmp_ip;
1705 int dstnode, numnodes;
1707 int minnode;
1708 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1709 struct ctdb_public_ip_list *minip;
1711 bool should_loop = true;
1712 bool have_unassigned = true;
1714 numnodes = talloc_array_length(ipflags);
1716 while (have_unassigned && should_loop) {
1717 should_loop = false;
1719 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1720 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1722 minnode = -1;
1723 mindsum = 0;
1724 minip = NULL;
1726 /* loop over each unassigned ip. */
1727 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1728 if (tmp_ip->pnn != -1) {
1729 continue;
1732 for (dstnode=0; dstnode<numnodes; dstnode++) {
1733 /* only check nodes that can actually takeover this ip */
1734 if (!can_node_takeover_ip(ctdb, dstnode,
1735 ipflags[dstnode],
1736 tmp_ip)) {
1737 /* no it couldnt so skip to the next node */
1738 continue;
1741 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1742 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1743 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1744 ctdb_addr_to_str(&(tmp_ip->addr)),
1745 dstnode,
1746 dstimbl - lcp2_imbalances[dstnode]));
1749 if ((minnode == -1) || (dstdsum < mindsum)) {
1750 minnode = dstnode;
1751 minimbl = dstimbl;
1752 mindsum = dstdsum;
1753 minip = tmp_ip;
1754 should_loop = true;
1759 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1761 /* If we found one then assign it to the given node. */
1762 if (minnode != -1) {
1763 minip->pnn = minnode;
1764 lcp2_imbalances[minnode] = minimbl;
1765 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1766 ctdb_addr_to_str(&(minip->addr)),
1767 minnode,
1768 mindsum));
1771 /* There might be a better way but at least this is clear. */
1772 have_unassigned = false;
1773 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1774 if (tmp_ip->pnn == -1) {
1775 have_unassigned = true;
1780 /* We know if we have an unassigned addresses so we might as
1781 * well optimise.
1783 if (have_unassigned) {
1784 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1785 if (tmp_ip->pnn == -1) {
1786 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1787 ctdb_addr_to_str(&tmp_ip->addr)));
1793 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1794 * to move IPs from, determines the best IP/destination node
1795 * combination to move from the source node.
1797 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1798 struct ctdb_ipflags *ipflags,
1799 struct ctdb_public_ip_list *all_ips,
1800 int srcnode,
1801 uint32_t *lcp2_imbalances,
1802 bool *rebalance_candidates)
1804 int dstnode, mindstnode, numnodes;
1805 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1806 uint32_t minsrcimbl, mindstimbl;
1807 struct ctdb_public_ip_list *minip;
1808 struct ctdb_public_ip_list *tmp_ip;
1810 /* Find an IP and destination node that best reduces imbalance. */
1811 srcimbl = 0;
1812 minip = NULL;
1813 minsrcimbl = 0;
1814 mindstnode = -1;
1815 mindstimbl = 0;
1817 numnodes = talloc_array_length(ipflags);
1819 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1820 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
1821 srcnode, lcp2_imbalances[srcnode]));
1823 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1824 /* Only consider addresses on srcnode. */
1825 if (tmp_ip->pnn != srcnode) {
1826 continue;
1829 /* What is this IP address costing the source node? */
1830 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1831 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
1833 /* Consider this IP address would cost each potential
1834 * destination node. Destination nodes are limited to
1835 * those that are newly healthy, since we don't want
1836 * to do gratuitous failover of IPs just to make minor
1837 * balance improvements.
1839 for (dstnode=0; dstnode<numnodes; dstnode++) {
1840 if (!rebalance_candidates[dstnode]) {
1841 continue;
1844 /* only check nodes that can actually takeover this ip */
1845 if (!can_node_takeover_ip(ctdb, dstnode,
1846 ipflags[dstnode], tmp_ip)) {
1847 /* no it couldnt so skip to the next node */
1848 continue;
1851 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1852 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1853 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1854 srcnode, -srcdsum,
1855 ctdb_addr_to_str(&(tmp_ip->addr)),
1856 dstnode, dstdsum));
1858 if ((dstimbl < lcp2_imbalances[srcnode]) &&
1859 (dstdsum < srcdsum) && \
1860 ((mindstnode == -1) || \
1861 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1863 minip = tmp_ip;
1864 minsrcimbl = srcimbl;
1865 mindstnode = dstnode;
1866 mindstimbl = dstimbl;
1870 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1872 if (mindstnode != -1) {
1873 /* We found a move that makes things better... */
1874 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1875 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1876 ctdb_addr_to_str(&(minip->addr)),
1877 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1880 lcp2_imbalances[srcnode] = minsrcimbl;
1881 lcp2_imbalances[mindstnode] = mindstimbl;
1882 minip->pnn = mindstnode;
1884 return true;
1887 return false;
/* Pairs a node number with its LCP2 imbalance so that nodes can be
 * sorted by imbalance. */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/*
 * qsort() comparator for struct lcp2_imbalance_pnn that orders by
 * DESCENDING imbalance: returns -1 when a has the larger imbalance,
 * 1 when b has, and 0 when they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *lhs = a;
	const struct lcp2_imbalance_pnn *rhs = b;

	if (lhs->imbalance == rhs->imbalance) {
		return 0;
	}

	return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
}
1910 /* LCP2 algorithm for rebalancing the cluster. This finds the source
1911 * node with the highest LCP2 imbalance, and then determines the best
1912 * IP/destination node combination to move from the source node.
1914 static void lcp2_failback(struct ctdb_context *ctdb,
1915 struct ctdb_ipflags *ipflags,
1916 struct ctdb_public_ip_list *all_ips,
1917 uint32_t *lcp2_imbalances,
1918 bool *rebalance_candidates)
1920 int i, numnodes;
1921 struct lcp2_imbalance_pnn * lips;
1922 bool again;
1924 numnodes = talloc_array_length(ipflags);
1926 try_again:
1927 /* Put the imbalances and nodes into an array, sort them and
1928 * iterate through candidates. Usually the 1st one will be
1929 * used, so this doesn't cost much...
1931 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
1932 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
1933 lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
1934 for (i=0; i<numnodes; i++) {
1935 lips[i].imbalance = lcp2_imbalances[i];
1936 lips[i].pnn = i;
1937 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
1939 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
1940 lcp2_cmp_imbalance_pnn);
1942 again = false;
1943 for (i=0; i<numnodes; i++) {
1944 /* This means that all nodes had 0 or 1 addresses, so
1945 * can't be imbalanced.
1947 if (lips[i].imbalance == 0) {
1948 break;
1951 if (lcp2_failback_candidate(ctdb,
1952 ipflags,
1953 all_ips,
1954 lips[i].pnn,
1955 lcp2_imbalances,
1956 rebalance_candidates)) {
1957 again = true;
1958 break;
1962 talloc_free(lips);
1963 if (again) {
1964 goto try_again;
1968 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
1969 struct ctdb_ipflags *ipflags,
1970 struct ctdb_public_ip_list *all_ips)
1972 struct ctdb_public_ip_list *tmp_ip;
1974 /* verify that the assigned nodes can serve that public ip
1975 and set it to -1 if not
1977 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1978 if (tmp_ip->pnn == -1) {
1979 continue;
1981 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
1982 ipflags[tmp_ip->pnn], tmp_ip) != 0) {
1983 /* this node can not serve this ip. */
1984 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
1985 ctdb_addr_to_str(&(tmp_ip->addr)),
1986 tmp_ip->pnn));
1987 tmp_ip->pnn = -1;
1992 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
1993 struct ctdb_ipflags *ipflags,
1994 struct ctdb_public_ip_list *all_ips)
1996 struct ctdb_public_ip_list *tmp_ip;
1997 int i, numnodes;
1999 numnodes = talloc_array_length(ipflags);
2001 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2002 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2003 * always be allocated the same way for a specific set of
2004 * available/unavailable nodes.
2007 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2008 tmp_ip->pnn = i % numnodes;
2011 /* IP failback doesn't make sense with deterministic
2012 * IPs, since the modulo step above implicitly fails
2013 * back IPs to their "home" node.
2015 if (1 == ctdb->tunable.no_ip_failback) {
2016 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2019 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2021 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2023 /* No failback here! */
2026 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2027 struct ctdb_ipflags *ipflags,
2028 struct ctdb_public_ip_list *all_ips)
2030 /* This should be pushed down into basic_failback. */
2031 struct ctdb_public_ip_list *tmp_ip;
2032 int num_ips = 0;
2033 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2034 num_ips++;
2037 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2039 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2041 /* If we don't want IPs to fail back then don't rebalance IPs. */
2042 if (1 == ctdb->tunable.no_ip_failback) {
2043 return;
2046 /* Now, try to make sure the ip adresses are evenly distributed
2047 across the nodes.
2049 basic_failback(ctdb, ipflags, all_ips, num_ips);
2052 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2053 struct ctdb_ipflags *ipflags,
2054 struct ctdb_public_ip_list *all_ips,
2055 uint32_t *force_rebalance_nodes)
2057 uint32_t *lcp2_imbalances;
2058 bool *rebalance_candidates;
2059 int numnodes, num_rebalance_candidates, i;
2061 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2063 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2065 lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
2066 &lcp2_imbalances, &rebalance_candidates);
2068 lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2070 /* If we don't want IPs to fail back then don't rebalance IPs. */
2071 if (1 == ctdb->tunable.no_ip_failback) {
2072 goto finished;
2075 /* It is only worth continuing if we have suitable target
2076 * nodes to transfer IPs to. This check is much cheaper than
2077 * continuing on...
2079 numnodes = talloc_array_length(ipflags);
2080 num_rebalance_candidates = 0;
2081 for (i=0; i<numnodes; i++) {
2082 if (rebalance_candidates[i]) {
2083 num_rebalance_candidates++;
2086 if (num_rebalance_candidates == 0) {
2087 goto finished;
2090 /* Now, try to make sure the ip adresses are evenly distributed
2091 across the nodes.
2093 lcp2_failback(ctdb, ipflags, all_ips,
2094 lcp2_imbalances, rebalance_candidates);
2096 finished:
2097 talloc_free(tmp_ctx);
2100 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2102 int i;
2104 for (i=0;i<nodemap->num;i++) {
2105 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2106 /* Found one completely healthy node */
2107 return false;
2111 return true;
2114 /* The calculation part of the IP allocation algorithm. */
2115 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2116 struct ctdb_ipflags *ipflags,
2117 struct ctdb_public_ip_list **all_ips_p,
2118 uint32_t *force_rebalance_nodes)
2120 /* since nodes only know about those public addresses that
2121 can be served by that particular node, no single node has
2122 a full list of all public addresses that exist in the cluster.
2123 Walk over all node structures and create a merged list of
2124 all public addresses that exist in the cluster.
2126 keep the tree of ips around as ctdb->ip_tree
2128 *all_ips_p = create_merged_ip_list(ctdb);
2130 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2131 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p, force_rebalance_nodes);
2132 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2133 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2134 } else {
2135 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2138 /* at this point ->pnn is the node which will own each IP
2139 or -1 if there is no node that can cover this ip
2142 return;
/* State shared between get_tunable_from_nodes() and its callbacks */
struct get_tunable_callback_data {
	/* name of the tunable, used in log messages */
	const char *tunable;
	/* talloc array receiving one value per node, indexed by pnn */
	uint32_t *out;
	/* set by the callbacks when the result must be discarded */
	bool fatal;
};
2151 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2152 int32_t res, TDB_DATA outdata,
2153 void *callback)
2155 struct get_tunable_callback_data *cd =
2156 (struct get_tunable_callback_data *)callback;
2157 int size;
2159 if (res != 0) {
2160 /* Already handled in fail callback */
2161 return;
2164 if (outdata.dsize != sizeof(uint32_t)) {
2165 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2166 cd->tunable, pnn, (int)sizeof(uint32_t),
2167 (int)outdata.dsize));
2168 cd->fatal = true;
2169 return;
2172 size = talloc_array_length(cd->out);
2173 if (pnn >= size) {
2174 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2175 cd->tunable, pnn, size));
2176 return;
2180 cd->out[pnn] = *(uint32_t *)outdata.dptr;
2183 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2184 int32_t res, TDB_DATA outdata,
2185 void *callback)
2187 struct get_tunable_callback_data *cd =
2188 (struct get_tunable_callback_data *)callback;
2190 switch (res) {
2191 case -ETIME:
2192 DEBUG(DEBUG_ERR,
2193 ("Timed out getting tunable \"%s\" from node %d\n",
2194 cd->tunable, pnn));
2195 cd->fatal = true;
2196 break;
2197 case -EINVAL:
2198 case -1:
2199 DEBUG(DEBUG_WARNING,
2200 ("Tunable \"%s\" not implemented on node %d\n",
2201 cd->tunable, pnn));
2202 break;
2203 default:
2204 DEBUG(DEBUG_ERR,
2205 ("Unexpected error getting tunable \"%s\" from node %d\n",
2206 cd->tunable, pnn));
2207 cd->fatal = true;
2211 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2212 TALLOC_CTX *tmp_ctx,
2213 struct ctdb_node_map *nodemap,
2214 const char *tunable,
2215 uint32_t default_value)
2217 TDB_DATA data;
2218 struct ctdb_control_get_tunable *t;
2219 uint32_t *nodes;
2220 uint32_t *tvals;
2221 struct get_tunable_callback_data callback_data;
2222 int i;
2224 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2225 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2226 for (i=0; i<nodemap->num; i++) {
2227 tvals[i] = default_value;
2230 callback_data.out = tvals;
2231 callback_data.tunable = tunable;
2232 callback_data.fatal = false;
2234 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2235 data.dptr = talloc_size(tmp_ctx, data.dsize);
2236 t = (struct ctdb_control_get_tunable *)data.dptr;
2237 t->length = strlen(tunable)+1;
2238 memcpy(t->name, tunable, t->length);
2239 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2240 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2241 nodes, 0, TAKEOVER_TIMEOUT(),
2242 false, data,
2243 get_tunable_callback,
2244 get_tunable_fail_callback,
2245 &callback_data) != 0) {
2246 if (callback_data.fatal) {
2247 talloc_free(tvals);
2248 tvals = NULL;
2251 talloc_free(nodes);
2252 talloc_free(data.dptr);
2254 return tvals;
/* State shared between get_runstate_from_nodes() and its callbacks */
struct get_runstate_callback_data {
	/* talloc array receiving one runstate per node, indexed by pnn */
	enum ctdb_runstate *out;
	/* set by the callbacks when the result must be discarded */
	bool fatal;
};
2262 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2263 int32_t res, TDB_DATA outdata,
2264 void *callback_data)
2266 struct get_runstate_callback_data *cd =
2267 (struct get_runstate_callback_data *)callback_data;
2268 int size;
2270 if (res != 0) {
2271 /* Already handled in fail callback */
2272 return;
2275 if (outdata.dsize != sizeof(uint32_t)) {
2276 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2277 pnn, (int)sizeof(uint32_t),
2278 (int)outdata.dsize));
2279 cd->fatal = true;
2280 return;
2283 size = talloc_array_length(cd->out);
2284 if (pnn >= size) {
2285 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2286 pnn, size));
2287 return;
2290 cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2293 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2294 int32_t res, TDB_DATA outdata,
2295 void *callback)
2297 struct get_runstate_callback_data *cd =
2298 (struct get_runstate_callback_data *)callback;
2300 switch (res) {
2301 case -ETIME:
2302 DEBUG(DEBUG_ERR,
2303 ("Timed out getting runstate from node %d\n", pnn));
2304 cd->fatal = true;
2305 break;
2306 default:
2307 DEBUG(DEBUG_WARNING,
2308 ("Error getting runstate from node %d - assuming runstates not supported\n",
2309 pnn));
2313 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2314 TALLOC_CTX *tmp_ctx,
2315 struct ctdb_node_map *nodemap,
2316 enum ctdb_runstate default_value)
2318 uint32_t *nodes;
2319 enum ctdb_runstate *rs;
2320 struct get_runstate_callback_data callback_data;
2321 int i;
2323 rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2324 CTDB_NO_MEMORY_NULL(ctdb, rs);
2325 for (i=0; i<nodemap->num; i++) {
2326 rs[i] = default_value;
2329 callback_data.out = rs;
2330 callback_data.fatal = false;
2332 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2333 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2334 nodes, 0, TAKEOVER_TIMEOUT(),
2335 true, tdb_null,
2336 get_runstate_callback,
2337 get_runstate_fail_callback,
2338 &callback_data) != 0) {
2339 if (callback_data.fatal) {
2340 free(rs);
2341 rs = NULL;
2344 talloc_free(nodes);
2346 return rs;
2349 /* Set internal flags for IP allocation:
2350 * Clear ip flags
2351 * Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2352 * Set NOIPHOST ip flag for each INACTIVE node
2353 * if all nodes are disabled:
2354 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2355 * else
2356 * Set NOIPHOST ip flags for disabled nodes
2358 static struct ctdb_ipflags *
2359 set_ipflags_internal(struct ctdb_context *ctdb,
2360 TALLOC_CTX *tmp_ctx,
2361 struct ctdb_node_map *nodemap,
2362 uint32_t *tval_noiptakeover,
2363 uint32_t *tval_noiphostonalldisabled,
2364 enum ctdb_runstate *runstate)
2366 int i;
2367 struct ctdb_ipflags *ipflags;
2369 /* Clear IP flags - implicit due to talloc_zero */
2370 ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2371 CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2373 for (i=0;i<nodemap->num;i++) {
2374 /* Can not take IPs on node with NoIPTakeover set */
2375 if (tval_noiptakeover[i] != 0) {
2376 ipflags[i].noiptakeover = true;
2379 /* Can not host IPs on node not in RUNNING state */
2380 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2381 ipflags[i].noiphost = true;
2382 continue;
2384 /* Can not host IPs on INACTIVE node */
2385 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2386 ipflags[i].noiphost = true;
2388 /* Remember the runstate */
2389 ipflags[i].runstate = runstate[i];
2392 if (all_nodes_are_disabled(nodemap)) {
2393 /* If all nodes are disabled, can not host IPs on node
2394 * with NoIPHostOnAllDisabled set
2396 for (i=0;i<nodemap->num;i++) {
2397 if (tval_noiphostonalldisabled[i] != 0) {
2398 ipflags[i].noiphost = true;
2401 } else {
2402 /* If some nodes are not disabled, then can not host
2403 * IPs on DISABLED node
2405 for (i=0;i<nodemap->num;i++) {
2406 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2407 ipflags[i].noiphost = true;
2412 return ipflags;
2415 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2416 TALLOC_CTX *tmp_ctx,
2417 struct ctdb_node_map *nodemap)
2419 uint32_t *tval_noiptakeover;
2420 uint32_t *tval_noiphostonalldisabled;
2421 struct ctdb_ipflags *ipflags;
2422 enum ctdb_runstate *runstate;
2425 tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2426 "NoIPTakeover", 0);
2427 if (tval_noiptakeover == NULL) {
2428 return NULL;
2431 tval_noiphostonalldisabled =
2432 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2433 "NoIPHostOnAllDisabled", 0);
2434 if (tval_noiphostonalldisabled == NULL) {
2435 /* Caller frees tmp_ctx */
2436 return NULL;
2439 /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2440 * will default to CTDB_RUNSTATE_RUNNING. This ensures
2441 * reasonable behaviour on a mixed cluster during upgrade.
2443 runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2444 CTDB_RUNSTATE_RUNNING);
2445 if (runstate == NULL) {
2446 /* Caller frees tmp_ctx */
2447 return NULL;
2450 ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2451 tval_noiptakeover,
2452 tval_noiphostonalldisabled,
2453 runstate);
2455 talloc_free(tval_noiptakeover);
2456 talloc_free(tval_noiphostonalldisabled);
2457 talloc_free(runstate);
2459 return ipflags;
2462 struct iprealloc_callback_data {
2463 bool *retry_nodes;
2464 int retry_count;
2465 client_async_callback fail_callback;
2466 void *fail_callback_data;
2467 struct ctdb_node_map *nodemap;
2470 static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2471 int32_t res, TDB_DATA outdata,
2472 void *callback)
2474 int numnodes;
2475 struct iprealloc_callback_data *cd =
2476 (struct iprealloc_callback_data *)callback;
2478 numnodes = talloc_array_length(cd->retry_nodes);
2479 if (pnn > numnodes) {
2480 DEBUG(DEBUG_ERR,
2481 ("ipreallocated failure from node %d, "
2482 "but only %d nodes in nodemap\n",
2483 pnn, numnodes));
2484 return;
2487 /* Can't run the "ipreallocated" event on a INACTIVE node */
2488 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
2489 DEBUG(DEBUG_WARNING,
2490 ("ipreallocated failed on inactive node %d, ignoring\n",
2491 pnn));
2492 return;
2495 switch (res) {
2496 case -ETIME:
2497 /* If the control timed out then that's a real error,
2498 * so call the real fail callback
2500 if (cd->fail_callback) {
2501 cd->fail_callback(ctdb, pnn, res, outdata,
2502 cd->fail_callback_data);
2503 } else {
2504 DEBUG(DEBUG_WARNING,
2505 ("iprealloc timed out but no callback registered\n"));
2507 break;
2508 default:
2509 /* If not a timeout then either the ipreallocated
2510 * eventscript (or some setup) failed. This might
2511 * have failed because the IPREALLOCATED control isn't
2512 * implemented - right now there is no way of knowing
2513 * because the error codes are all folded down to -1.
2514 * Consider retrying using EVENTSCRIPT control...
2516 DEBUG(DEBUG_WARNING,
2517 ("ipreallocated failure from node %d, flagging retry\n",
2518 pnn));
2519 cd->retry_nodes[pnn] = true;
2520 cd->retry_count++;
2524 struct takeover_callback_data {
2525 bool *node_failed;
2526 client_async_callback fail_callback;
2527 void *fail_callback_data;
2528 struct ctdb_node_map *nodemap;
2531 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
2532 uint32_t node_pnn, int32_t res,
2533 TDB_DATA outdata, void *callback_data)
2535 struct takeover_callback_data *cd =
2536 talloc_get_type_abort(callback_data,
2537 struct takeover_callback_data);
2538 int i;
2540 for (i = 0; i < cd->nodemap->num; i++) {
2541 if (node_pnn == cd->nodemap->nodes[i].pnn) {
2542 break;
2546 if (i == cd->nodemap->num) {
2547 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
2548 return;
2551 if (!cd->node_failed[i]) {
2552 cd->node_failed[i] = true;
2553 cd->fail_callback(ctdb, node_pnn, res, outdata,
2554 cd->fail_callback_data);
2559 make any IP alias changes for public addresses that are necessary
2561 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2562 uint32_t *force_rebalance_nodes,
2563 client_async_callback fail_callback, void *callback_data)
2565 int i, j, ret;
2566 struct ctdb_public_ip ip;
2567 uint32_t *nodes;
2568 struct ctdb_public_ip_list *all_ips, *tmp_ip;
2569 TDB_DATA data;
2570 struct timeval timeout;
2571 struct client_async_data *async_data;
2572 struct ctdb_client_control_state *state;
2573 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2574 struct ctdb_ipflags *ipflags;
2575 struct takeover_callback_data *takeover_data;
2576 struct iprealloc_callback_data iprealloc_data;
2577 bool *retry_data;
2578 bool can_host_ips;
2581 * ip failover is completely disabled, just send out the
2582 * ipreallocated event.
2584 if (ctdb->tunable.disable_ip_failover != 0) {
2585 goto ipreallocated;
2588 ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2589 if (ipflags == NULL) {
2590 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2591 talloc_free(tmp_ctx);
2592 return -1;
2595 /* Short-circuit IP allocation if no nodes are in the RUNNING
2596 * runstate yet, since no nodes will be able to host IPs */
2597 can_host_ips = false;
2598 for (i=0; i<nodemap->num; i++) {
2599 if (ipflags[i].runstate == CTDB_RUNSTATE_RUNNING) {
2600 can_host_ips = true;
2603 if (!can_host_ips) {
2604 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
2605 return 0;
2608 /* Do the IP reassignment calculations */
2609 ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
2611 /* Now tell all nodes to release any public IPs should not
2612 * host. This will be a NOOP on nodes that don't currently
2613 * hold the given IP.
2615 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
2616 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
2618 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
2619 bool, nodemap->num);
2620 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
2621 takeover_data->fail_callback = fail_callback;
2622 takeover_data->fail_callback_data = callback_data;
2623 takeover_data->nodemap = nodemap;
2625 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2626 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2628 async_data->fail_callback = takeover_run_fail_callback;
2629 async_data->callback_data = takeover_data;
2631 ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
2633 /* Send a RELEASE_IP to all nodes that should not be hosting
2634 * each IP. For each IP, all but one of these will be
2635 * redundant. However, the redundant ones are used to tell
2636 * nodes which node should be hosting the IP so that commands
2637 * like "ctdb ip" can display a particular nodes idea of who
2638 * is hosting what. */
2639 for (i=0;i<nodemap->num;i++) {
2640 /* don't talk to unconnected nodes, but do talk to banned nodes */
2641 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2642 continue;
2645 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2646 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2647 /* This node should be serving this
2648 vnn so dont tell it to release the ip
2650 continue;
2652 ip.pnn = tmp_ip->pnn;
2653 ip.addr = tmp_ip->addr;
2655 timeout = TAKEOVER_TIMEOUT();
2656 data.dsize = sizeof(ip);
2657 data.dptr = (uint8_t *)&ip;
2658 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2659 0, CTDB_CONTROL_RELEASE_IP, 0,
2660 data, async_data,
2661 &timeout, NULL);
2662 if (state == NULL) {
2663 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2664 talloc_free(tmp_ctx);
2665 return -1;
2668 ctdb_client_async_add(async_data, state);
2671 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2672 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2673 talloc_free(tmp_ctx);
2674 return -1;
2676 talloc_free(async_data);
2679 /* For each IP, send a TAKOVER_IP to the node that should be
2680 * hosting it. Many of these will often be redundant (since
2681 * the allocation won't have changed) but they can be useful
2682 * to recover from inconsistencies. */
2683 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2684 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2686 async_data->fail_callback = fail_callback;
2687 async_data->callback_data = callback_data;
2689 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2690 if (tmp_ip->pnn == -1) {
2691 /* this IP won't be taken over */
2692 continue;
2695 ip.pnn = tmp_ip->pnn;
2696 ip.addr = tmp_ip->addr;
2698 timeout = TAKEOVER_TIMEOUT();
2699 data.dsize = sizeof(ip);
2700 data.dptr = (uint8_t *)&ip;
2701 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2702 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2703 data, async_data, &timeout, NULL);
2704 if (state == NULL) {
2705 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2706 talloc_free(tmp_ctx);
2707 return -1;
2710 ctdb_client_async_add(async_data, state);
2712 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2713 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2714 talloc_free(tmp_ctx);
2715 return -1;
2718 ipreallocated:
2720 * Tell all nodes to run eventscripts to process the
2721 * "ipreallocated" event. This can do a lot of things,
2722 * including restarting services to reconfigure them if public
2723 * IPs have moved. Once upon a time this event only used to
2724 * update natgw.
2726 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
2727 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
2728 iprealloc_data.retry_nodes = retry_data;
2729 iprealloc_data.retry_count = 0;
2730 iprealloc_data.fail_callback = fail_callback;
2731 iprealloc_data.fail_callback_data = callback_data;
2732 iprealloc_data.nodemap = nodemap;
2734 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2735 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2736 nodes, 0, TAKEOVER_TIMEOUT(),
2737 false, tdb_null,
2738 NULL, iprealloc_fail_callback,
2739 &iprealloc_data);
2740 if (ret != 0) {
2741 /* If the control failed then we should retry to any
2742 * nodes flagged by iprealloc_fail_callback using the
2743 * EVENTSCRIPT control. This is a best-effort at
2744 * backward compatiblity when running a mixed cluster
2745 * where some nodes have not yet been upgraded to
2746 * support the IPREALLOCATED control.
2748 DEBUG(DEBUG_WARNING,
2749 ("Retry ipreallocated to some nodes using eventscript control\n"));
2751 nodes = talloc_array(tmp_ctx, uint32_t,
2752 iprealloc_data.retry_count);
2753 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
2755 j = 0;
2756 for (i=0; i<nodemap->num; i++) {
2757 if (iprealloc_data.retry_nodes[i]) {
2758 nodes[j] = i;
2759 j++;
2763 data.dptr = discard_const("ipreallocated");
2764 data.dsize = strlen((char *)data.dptr) + 1;
2765 ret = ctdb_client_async_control(ctdb,
2766 CTDB_CONTROL_RUN_EVENTSCRIPTS,
2767 nodes, 0, TAKEOVER_TIMEOUT(),
2768 false, data,
2769 NULL, fail_callback,
2770 callback_data);
2771 if (ret != 0) {
2772 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2776 talloc_free(tmp_ctx);
2777 return ret;
2782 destroy a ctdb_client_ip structure
2784 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2786 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2787 ctdb_addr_to_str(&ip->addr),
2788 ntohs(ip->addr.ip.sin_port),
2789 ip->client_id));
2791 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2792 return 0;
2796 called by a client to inform us of a TCP connection that it is managing
2797 that should tickled with an ACK when IP takeover is done
2799 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2800 TDB_DATA indata)
2802 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2803 struct ctdb_control_tcp_addr *tcp_sock = NULL;
2804 struct ctdb_tcp_list *tcp;
2805 struct ctdb_tcp_connection t;
2806 int ret;
2807 TDB_DATA data;
2808 struct ctdb_client_ip *ip;
2809 struct ctdb_vnn *vnn;
2810 ctdb_sock_addr addr;
2812 /* If we don't have public IPs, tickles are useless */
2813 if (ctdb->vnn == NULL) {
2814 return 0;
2817 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2819 addr = tcp_sock->src;
2820 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2821 addr = tcp_sock->dest;
2822 ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2824 ZERO_STRUCT(addr);
2825 memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2826 vnn = find_public_ip_vnn(ctdb, &addr);
2827 if (vnn == NULL) {
2828 switch (addr.sa.sa_family) {
2829 case AF_INET:
2830 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2831 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2832 ctdb_addr_to_str(&addr)));
2834 break;
2835 case AF_INET6:
2836 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2837 ctdb_addr_to_str(&addr)));
2838 break;
2839 default:
2840 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2843 return 0;
2846 if (vnn->pnn != ctdb->pnn) {
2847 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2848 ctdb_addr_to_str(&addr),
2849 client_id, client->pid));
2850 /* failing this call will tell smbd to die */
2851 return -1;
2854 ip = talloc(client, struct ctdb_client_ip);
2855 CTDB_NO_MEMORY(ctdb, ip);
2857 ip->ctdb = ctdb;
2858 ip->addr = addr;
2859 ip->client_id = client_id;
2860 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2861 DLIST_ADD(ctdb->client_ip_list, ip);
2863 tcp = talloc(client, struct ctdb_tcp_list);
2864 CTDB_NO_MEMORY(ctdb, tcp);
2866 tcp->connection.src_addr = tcp_sock->src;
2867 tcp->connection.dst_addr = tcp_sock->dest;
2869 DLIST_ADD(client->tcp_list, tcp);
2871 t.src_addr = tcp_sock->src;
2872 t.dst_addr = tcp_sock->dest;
2874 data.dptr = (uint8_t *)&t;
2875 data.dsize = sizeof(t);
2877 switch (addr.sa.sa_family) {
2878 case AF_INET:
2879 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2880 (unsigned)ntohs(tcp_sock->dest.ip.sin_port),
2881 ctdb_addr_to_str(&tcp_sock->src),
2882 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2883 break;
2884 case AF_INET6:
2885 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2886 (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port),
2887 ctdb_addr_to_str(&tcp_sock->src),
2888 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2889 break;
2890 default:
2891 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2895 /* tell all nodes about this tcp connection */
2896 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2897 CTDB_CONTROL_TCP_ADD,
2898 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2899 if (ret != 0) {
2900 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2901 return -1;
2904 return 0;
2908 find a tcp address on a list
2910 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2911 struct ctdb_tcp_connection *tcp)
2913 int i;
2915 if (array == NULL) {
2916 return NULL;
2919 for (i=0;i<array->num;i++) {
2920 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2921 ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2922 return &array->connections[i];
2925 return NULL;
2931 called by a daemon to inform us of a TCP connection that one of its
2932 clients managing that should tickled with an ACK when IP takeover is
2933 done
2935 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2937 struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2938 struct ctdb_tcp_array *tcparray;
2939 struct ctdb_tcp_connection tcp;
2940 struct ctdb_vnn *vnn;
2942 /* If we don't have public IPs, tickles are useless */
2943 if (ctdb->vnn == NULL) {
2944 return 0;
2947 vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2948 if (vnn == NULL) {
2949 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2950 ctdb_addr_to_str(&p->dst_addr)));
2952 return -1;
2956 tcparray = vnn->tcp_array;
2958 /* If this is the first tickle */
2959 if (tcparray == NULL) {
2960 tcparray = talloc(vnn, struct ctdb_tcp_array);
2961 CTDB_NO_MEMORY(ctdb, tcparray);
2962 vnn->tcp_array = tcparray;
2964 tcparray->num = 0;
2965 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2966 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2968 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2969 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2970 tcparray->num++;
2972 if (tcp_update_needed) {
2973 vnn->tcp_update_needed = true;
2975 return 0;
2979 /* Do we already have this tickle ?*/
2980 tcp.src_addr = p->src_addr;
2981 tcp.dst_addr = p->dst_addr;
2982 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2983 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2984 ctdb_addr_to_str(&tcp.dst_addr),
2985 ntohs(tcp.dst_addr.ip.sin_port),
2986 vnn->pnn));
2987 return 0;
2990 /* A new tickle, we must add it to the array */
2991 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2992 struct ctdb_tcp_connection,
2993 tcparray->num+1);
2994 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2996 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2997 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2998 tcparray->num++;
3000 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
3001 ctdb_addr_to_str(&tcp.dst_addr),
3002 ntohs(tcp.dst_addr.ip.sin_port),
3003 vnn->pnn));
3005 if (tcp_update_needed) {
3006 vnn->tcp_update_needed = true;
3009 return 0;
3014 called by a daemon to inform us of a TCP connection that one of its
3015 clients managing that should tickled with an ACK when IP takeover is
3016 done
3018 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
3020 struct ctdb_tcp_connection *tcpp;
3021 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
3023 if (vnn == NULL) {
3024 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
3025 ctdb_addr_to_str(&conn->dst_addr)));
3026 return;
3029 /* if the array is empty we cant remove it
3030 and we dont need to do anything
3032 if (vnn->tcp_array == NULL) {
3033 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3034 ctdb_addr_to_str(&conn->dst_addr),
3035 ntohs(conn->dst_addr.ip.sin_port)));
3036 return;
3040 /* See if we know this connection
3041 if we dont know this connection then we dont need to do anything
3043 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3044 if (tcpp == NULL) {
3045 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3046 ctdb_addr_to_str(&conn->dst_addr),
3047 ntohs(conn->dst_addr.ip.sin_port)));
3048 return;
3052 /* We need to remove this entry from the array.
3053 Instead of allocating a new array and copying data to it
3054 we cheat and just copy the last entry in the existing array
3055 to the entry that is to be removed and just shring the
3056 ->num field
3058 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3059 vnn->tcp_array->num--;
3061 /* If we deleted the last entry we also need to remove the entire array
3063 if (vnn->tcp_array->num == 0) {
3064 talloc_free(vnn->tcp_array);
3065 vnn->tcp_array = NULL;
3068 vnn->tcp_update_needed = true;
3070 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3071 ctdb_addr_to_str(&conn->src_addr),
3072 ntohs(conn->src_addr.ip.sin_port)));
3077 called by a daemon to inform us of a TCP connection that one of its
3078 clients used are no longer needed in the tickle database
3080 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3082 struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3084 /* If we don't have public IPs, tickles are useless */
3085 if (ctdb->vnn == NULL) {
3086 return 0;
3089 ctdb_remove_tcp_connection(ctdb, conn);
3091 return 0;
3096 Called when another daemon starts - causes all tickles for all
3097 public addresses we are serving to be sent to the new node on the
3098 next check. This actually causes the next scheduled call to
3099 tdb_update_tcp_tickles() to update all nodes. This is simple and
3100 doesn't require careful error handling.
3102 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
3104 struct ctdb_vnn *vnn;
3106 DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
3107 (unsigned long) pnn));
3109 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
3110 vnn->tcp_update_needed = true;
3113 return 0;
3118 called when a client structure goes away - hook to remove
3119 elements from the tcp_list in all daemons
3121 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3123 while (client->tcp_list) {
3124 struct ctdb_tcp_list *tcp = client->tcp_list;
3125 DLIST_REMOVE(client->tcp_list, tcp);
3126 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3131 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3133 struct ctdb_vnn *vnn;
3134 int count = 0;
3136 if (ctdb->tunable.disable_ip_failover == 1) {
3137 return;
3140 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3141 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3142 ctdb_vnn_unassign_iface(ctdb, vnn);
3143 continue;
3145 if (!vnn->iface) {
3146 continue;
3149 /* Don't allow multiple releases at once. Some code,
3150 * particularly ctdb_tickle_sentenced_connections() is
3151 * not re-entrant */
3152 if (vnn->update_in_flight) {
3153 DEBUG(DEBUG_WARNING,
3154 (__location__
3155 " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
3156 ctdb_addr_to_str(&vnn->public_address),
3157 vnn->public_netmask_bits,
3158 ctdb_vnn_iface_string(vnn)));
3159 continue;
3161 vnn->update_in_flight = true;
3163 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
3164 ctdb_addr_to_str(&vnn->public_address),
3165 vnn->public_netmask_bits,
3166 ctdb_vnn_iface_string(vnn)));
3168 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3169 ctdb_vnn_iface_string(vnn),
3170 ctdb_addr_to_str(&vnn->public_address),
3171 vnn->public_netmask_bits);
3172 release_kill_clients(ctdb, &vnn->public_address);
3173 ctdb_vnn_unassign_iface(ctdb, vnn);
3174 vnn->update_in_flight = false;
3175 count++;
3178 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
3183 get list of public IPs
3185 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3186 struct ctdb_req_control *c, TDB_DATA *outdata)
3188 int i, num, len;
3189 struct ctdb_all_public_ips *ips;
3190 struct ctdb_vnn *vnn;
3191 bool only_available = false;
3193 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3194 only_available = true;
3197 /* count how many public ip structures we have */
3198 num = 0;
3199 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3200 num++;
3203 len = offsetof(struct ctdb_all_public_ips, ips) +
3204 num*sizeof(struct ctdb_public_ip);
3205 ips = talloc_zero_size(outdata, len);
3206 CTDB_NO_MEMORY(ctdb, ips);
3208 i = 0;
3209 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3210 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3211 continue;
3213 ips->ips[i].pnn = vnn->pnn;
3214 ips->ips[i].addr = vnn->public_address;
3215 i++;
3217 ips->num = i;
3218 len = offsetof(struct ctdb_all_public_ips, ips) +
3219 i*sizeof(struct ctdb_public_ip);
3221 outdata->dsize = len;
3222 outdata->dptr = (uint8_t *)ips;
3224 return 0;
3228 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3229 struct ctdb_req_control *c,
3230 TDB_DATA indata,
3231 TDB_DATA *outdata)
3233 int i, num, len;
3234 ctdb_sock_addr *addr;
3235 struct ctdb_control_public_ip_info *info;
3236 struct ctdb_vnn *vnn;
3238 addr = (ctdb_sock_addr *)indata.dptr;
3240 vnn = find_public_ip_vnn(ctdb, addr);
3241 if (vnn == NULL) {
3242 /* if it is not a public ip it could be our 'single ip' */
3243 if (ctdb->single_ip_vnn) {
3244 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3245 vnn = ctdb->single_ip_vnn;
3249 if (vnn == NULL) {
3250 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3251 "'%s'not a public address\n",
3252 ctdb_addr_to_str(addr)));
3253 return -1;
3256 /* count how many public ip structures we have */
3257 num = 0;
3258 for (;vnn->ifaces[num];) {
3259 num++;
3262 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3263 num*sizeof(struct ctdb_control_iface_info);
3264 info = talloc_zero_size(outdata, len);
3265 CTDB_NO_MEMORY(ctdb, info);
3267 info->ip.addr = vnn->public_address;
3268 info->ip.pnn = vnn->pnn;
3269 info->active_idx = 0xFFFFFFFF;
3271 for (i=0; vnn->ifaces[i]; i++) {
3272 struct ctdb_iface *cur;
3274 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3275 if (cur == NULL) {
3276 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3277 vnn->ifaces[i]));
3278 return -1;
3280 if (vnn->iface == cur) {
3281 info->active_idx = i;
3283 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
3284 info->ifaces[i].link_state = cur->link_up;
3285 info->ifaces[i].references = cur->references;
3287 info->num = i;
3288 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3289 i*sizeof(struct ctdb_control_iface_info);
3291 outdata->dsize = len;
3292 outdata->dptr = (uint8_t *)info;
3294 return 0;
3297 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3298 struct ctdb_req_control *c,
3299 TDB_DATA *outdata)
3301 int i, num, len;
3302 struct ctdb_control_get_ifaces *ifaces;
3303 struct ctdb_iface *cur;
3305 /* count how many public ip structures we have */
3306 num = 0;
3307 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3308 num++;
3311 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3312 num*sizeof(struct ctdb_control_iface_info);
3313 ifaces = talloc_zero_size(outdata, len);
3314 CTDB_NO_MEMORY(ctdb, ifaces);
3316 i = 0;
3317 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3318 strcpy(ifaces->ifaces[i].name, cur->name);
3319 ifaces->ifaces[i].link_state = cur->link_up;
3320 ifaces->ifaces[i].references = cur->references;
3321 i++;
3323 ifaces->num = i;
3324 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3325 i*sizeof(struct ctdb_control_iface_info);
3327 outdata->dsize = len;
3328 outdata->dptr = (uint8_t *)ifaces;
3330 return 0;
3333 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3334 struct ctdb_req_control *c,
3335 TDB_DATA indata)
3337 struct ctdb_control_iface_info *info;
3338 struct ctdb_iface *iface;
3339 bool link_up = false;
3341 info = (struct ctdb_control_iface_info *)indata.dptr;
3343 if (info->name[CTDB_IFACE_SIZE] != '\0') {
3344 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3345 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3346 len, len, info->name));
3347 return -1;
3350 switch (info->link_state) {
3351 case 0:
3352 link_up = false;
3353 break;
3354 case 1:
3355 link_up = true;
3356 break;
3357 default:
3358 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3359 (unsigned int)info->link_state));
3360 return -1;
3363 if (info->references != 0) {
3364 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3365 (unsigned int)info->references));
3366 return -1;
3369 iface = ctdb_find_iface(ctdb, info->name);
3370 if (iface == NULL) {
3371 return -1;
3374 if (link_up == iface->link_up) {
3375 return 0;
3378 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3379 ("iface[%s] has changed it's link status %s => %s\n",
3380 iface->name,
3381 iface->link_up?"up":"down",
3382 link_up?"up":"down"));
3384 iface->link_up = link_up;
3385 return 0;
3390 structure containing the listening socket and the list of tcp connections
3391 that the ctdb daemon is to kill
3393 struct ctdb_kill_tcp {
3394 struct ctdb_vnn *vnn;
3395 struct ctdb_context *ctdb;
3396 int capture_fd;
3397 struct fd_event *fde;
3398 trbt_tree_t *connections;
3399 void *private_data;
3403 a tcp connection that is to be killed
3405 struct ctdb_killtcp_con {
3406 ctdb_sock_addr src_addr;
3407 ctdb_sock_addr dst_addr;
3408 int count;
3409 struct ctdb_kill_tcp *killtcp;
3412 /* this function is used to create a key to represent this socketpair
3413 in the killtcp tree.
3414 this key is used to insert and lookup matching socketpairs that are
3415 to be tickled and RST
3417 #define KILLTCP_KEYLEN 10
3418 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3420 static uint32_t key[KILLTCP_KEYLEN];
3422 bzero(key, sizeof(key));
3424 if (src->sa.sa_family != dst->sa.sa_family) {
3425 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3426 return key;
3429 switch (src->sa.sa_family) {
3430 case AF_INET:
3431 key[0] = dst->ip.sin_addr.s_addr;
3432 key[1] = src->ip.sin_addr.s_addr;
3433 key[2] = dst->ip.sin_port;
3434 key[3] = src->ip.sin_port;
3435 break;
3436 case AF_INET6: {
3437 uint32_t *dst6_addr32 =
3438 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3439 uint32_t *src6_addr32 =
3440 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3441 key[0] = dst6_addr32[3];
3442 key[1] = src6_addr32[3];
3443 key[2] = dst6_addr32[2];
3444 key[3] = src6_addr32[2];
3445 key[4] = dst6_addr32[1];
3446 key[5] = src6_addr32[1];
3447 key[6] = dst6_addr32[0];
3448 key[7] = src6_addr32[0];
3449 key[8] = dst->ip6.sin6_port;
3450 key[9] = src->ip6.sin6_port;
3451 break;
3453 default:
3454 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3455 return key;
3458 return key;
3462 called when we get a read event on the raw socket
3464 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde,
3465 uint16_t flags, void *private_data)
3467 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3468 struct ctdb_killtcp_con *con;
3469 ctdb_sock_addr src, dst;
3470 uint32_t ack_seq, seq;
3472 if (!(flags & EVENT_FD_READ)) {
3473 return;
3476 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3477 killtcp->private_data,
3478 &src, &dst,
3479 &ack_seq, &seq) != 0) {
3480 /* probably a non-tcp ACK packet */
3481 return;
3484 /* check if we have this guy in our list of connections
3485 to kill
3487 con = trbt_lookuparray32(killtcp->connections,
3488 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3489 if (con == NULL) {
3490 /* no this was some other packet we can just ignore */
3491 return;
3494 /* This one has been tickled !
3495 now reset him and remove him from the list.
3497 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3498 ntohs(con->dst_addr.ip.sin_port),
3499 ctdb_addr_to_str(&con->src_addr),
3500 ntohs(con->src_addr.ip.sin_port)));
3502 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3503 talloc_free(con);
3507 /* when traversing the list of all tcp connections to send tickle acks to
3508 (so that we can capture the ack coming back and kill the connection
3509 by a RST)
3510 this callback is called for each connection we are currently trying to kill
3512 static int tickle_connection_traverse(void *param, void *data)
3514 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3516 /* have tried too many times, just give up */
3517 if (con->count >= 5) {
3518 /* can't delete in traverse: reparent to delete_cons */
3519 talloc_steal(param, con);
3520 return 0;
3523 /* othervise, try tickling it again */
3524 con->count++;
3525 ctdb_sys_send_tcp(
3526 (ctdb_sock_addr *)&con->dst_addr,
3527 (ctdb_sock_addr *)&con->src_addr,
3528 0, 0, 0);
3529 return 0;
3534 called every second until all sentenced connections have been reset
3536 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te,
3537 struct timeval t, void *private_data)
3539 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3540 void *delete_cons = talloc_new(NULL);
3542 /* loop over all connections sending tickle ACKs */
3543 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3545 /* now we've finished traverse, it's safe to do deletion. */
3546 talloc_free(delete_cons);
3548 /* If there are no more connections to kill we can remove the
3549 entire killtcp structure
3551 if ( (killtcp->connections == NULL) ||
3552 (killtcp->connections->root == NULL) ) {
3553 talloc_free(killtcp);
3554 return;
3557 /* try tickling them again in a seconds time
3559 event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3560 ctdb_tickle_sentenced_connections, killtcp);
3564 destroy the killtcp structure
3566 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3568 struct ctdb_vnn *tmpvnn;
3570 /* verify that this vnn is still active */
3571 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3572 if (tmpvnn == killtcp->vnn) {
3573 break;
3577 if (tmpvnn == NULL) {
3578 return 0;
3581 if (killtcp->vnn->killtcp != killtcp) {
3582 return 0;
3585 killtcp->vnn->killtcp = NULL;
3587 return 0;
/*
  Insert callback for the connection tree: the new entry always wins.

  'parm' is the new connection structure and 'data' is whatever was
  already stored under the same key.  The old entry is deliberately not
  freed here; it is talloc_stolen by the same node in the tree and is
  deleted when the new data is deleted.
 */
static void *add_killtcp_callback(void *parm, void *data)
{
	void *replacement = parm;

	(void)data;

	return replacement;
}
3604 add a tcp socket to the list of connections we want to RST
3606 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3607 ctdb_sock_addr *s,
3608 ctdb_sock_addr *d)
3610 ctdb_sock_addr src, dst;
3611 struct ctdb_kill_tcp *killtcp;
3612 struct ctdb_killtcp_con *con;
3613 struct ctdb_vnn *vnn;
3615 ctdb_canonicalize_ip(s, &src);
3616 ctdb_canonicalize_ip(d, &dst);
3618 vnn = find_public_ip_vnn(ctdb, &dst);
3619 if (vnn == NULL) {
3620 vnn = find_public_ip_vnn(ctdb, &src);
3622 if (vnn == NULL) {
3623 /* if it is not a public ip it could be our 'single ip' */
3624 if (ctdb->single_ip_vnn) {
3625 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3626 vnn = ctdb->single_ip_vnn;
3630 if (vnn == NULL) {
3631 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3632 return -1;
3635 killtcp = vnn->killtcp;
3637 /* If this is the first connection to kill we must allocate
3638 a new structure
3640 if (killtcp == NULL) {
3641 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3642 CTDB_NO_MEMORY(ctdb, killtcp);
3644 killtcp->vnn = vnn;
3645 killtcp->ctdb = ctdb;
3646 killtcp->capture_fd = -1;
3647 killtcp->connections = trbt_create(killtcp, 0);
3649 vnn->killtcp = killtcp;
3650 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3655 /* create a structure that describes this connection we want to
3656 RST and store it in killtcp->connections
3658 con = talloc(killtcp, struct ctdb_killtcp_con);
3659 CTDB_NO_MEMORY(ctdb, con);
3660 con->src_addr = src;
3661 con->dst_addr = dst;
3662 con->count = 0;
3663 con->killtcp = killtcp;
3666 trbt_insertarray32_callback(killtcp->connections,
3667 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3668 add_killtcp_callback, con);
3671 If we dont have a socket to listen on yet we must create it
3673 if (killtcp->capture_fd == -1) {
3674 const char *iface = ctdb_vnn_iface_string(vnn);
3675 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3676 if (killtcp->capture_fd == -1) {
3677 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3678 "socket on iface '%s' for killtcp (%s)\n",
3679 iface, strerror(errno)));
3680 goto failed;
3685 if (killtcp->fde == NULL) {
3686 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd,
3687 EVENT_FD_READ,
3688 capture_tcp_handler, killtcp);
3689 tevent_fd_set_auto_close(killtcp->fde);
3691 /* We also need to set up some events to tickle all these connections
3692 until they are all reset
3694 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3695 ctdb_tickle_sentenced_connections, killtcp);
3698 /* tickle him once now */
3699 ctdb_sys_send_tcp(
3700 &con->dst_addr,
3701 &con->src_addr,
3702 0, 0, 0);
3704 return 0;
3706 failed:
3707 talloc_free(vnn->killtcp);
3708 vnn->killtcp = NULL;
3709 return -1;
3713 kill a TCP connection.
3715 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3717 struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3719 return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3723 called by a daemon to inform us of the entire list of TCP tickles for
3724 a particular public address.
3725 this control should only be sent by the node that is currently serving
3726 that public address.
3728 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3730 struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3731 struct ctdb_tcp_array *tcparray;
3732 struct ctdb_vnn *vnn;
3734 /* We must at least have tickles.num or else we cant verify the size
3735 of the received data blob
3737 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3738 tickles.connections)) {
3739 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3740 return -1;
3743 /* verify that the size of data matches what we expect */
3744 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3745 tickles.connections)
3746 + sizeof(struct ctdb_tcp_connection)
3747 * list->tickles.num) {
3748 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3749 return -1;
3752 DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3753 ctdb_addr_to_str(&list->addr)));
3755 vnn = find_public_ip_vnn(ctdb, &list->addr);
3756 if (vnn == NULL) {
3757 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3758 ctdb_addr_to_str(&list->addr)));
3760 return 1;
3763 /* remove any old ticklelist we might have */
3764 talloc_free(vnn->tcp_array);
3765 vnn->tcp_array = NULL;
3767 tcparray = talloc(vnn, struct ctdb_tcp_array);
3768 CTDB_NO_MEMORY(ctdb, tcparray);
3770 tcparray->num = list->tickles.num;
3772 tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3773 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3775 memcpy(tcparray->connections, &list->tickles.connections[0],
3776 sizeof(struct ctdb_tcp_connection)*tcparray->num);
3778 /* We now have a new fresh tickle list array for this vnn */
3779 vnn->tcp_array = tcparray;
3781 return 0;
3785 called to return the full list of tickles for the puclic address associated
3786 with the provided vnn
3788 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3790 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3791 struct ctdb_control_tcp_tickle_list *list;
3792 struct ctdb_tcp_array *tcparray;
3793 int num;
3794 struct ctdb_vnn *vnn;
3796 vnn = find_public_ip_vnn(ctdb, addr);
3797 if (vnn == NULL) {
3798 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3799 ctdb_addr_to_str(addr)));
3801 return 1;
3804 tcparray = vnn->tcp_array;
3805 if (tcparray) {
3806 num = tcparray->num;
3807 } else {
3808 num = 0;
3811 outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list,
3812 tickles.connections)
3813 + sizeof(struct ctdb_tcp_connection) * num;
3815 outdata->dptr = talloc_size(outdata, outdata->dsize);
3816 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3817 list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3819 list->addr = *addr;
3820 list->tickles.num = num;
3821 if (num) {
3822 memcpy(&list->tickles.connections[0], tcparray->connections,
3823 sizeof(struct ctdb_tcp_connection) * num);
3826 return 0;
3831 set the list of all tcp tickles for a public address
3833 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3834 ctdb_sock_addr *addr,
3835 struct ctdb_tcp_array *tcparray)
3837 int ret, num;
3838 TDB_DATA data;
3839 struct ctdb_control_tcp_tickle_list *list;
3841 if (tcparray) {
3842 num = tcparray->num;
3843 } else {
3844 num = 0;
3847 data.dsize = offsetof(struct ctdb_control_tcp_tickle_list,
3848 tickles.connections) +
3849 sizeof(struct ctdb_tcp_connection) * num;
3850 data.dptr = talloc_size(ctdb, data.dsize);
3851 CTDB_NO_MEMORY(ctdb, data.dptr);
3853 list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3854 list->addr = *addr;
3855 list->tickles.num = num;
3856 if (tcparray) {
3857 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3860 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3861 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3862 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3863 if (ret != 0) {
3864 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3865 return -1;
3868 talloc_free(data.dptr);
3870 return ret;
3875 perform tickle updates if required
3877 static void ctdb_update_tcp_tickles(struct event_context *ev,
3878 struct timed_event *te,
3879 struct timeval t, void *private_data)
3881 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3882 int ret;
3883 struct ctdb_vnn *vnn;
3885 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3886 /* we only send out updates for public addresses that
3887 we have taken over
3889 if (ctdb->pnn != vnn->pnn) {
3890 continue;
3892 /* We only send out the updates if we need to */
3893 if (!vnn->tcp_update_needed) {
3894 continue;
3896 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3897 &vnn->public_address,
3898 vnn->tcp_array);
3899 if (ret != 0) {
3900 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3901 ctdb_addr_to_str(&vnn->public_address)));
3902 } else {
3903 DEBUG(DEBUG_INFO,
3904 ("Sent tickle update for public address %s\n",
3905 ctdb_addr_to_str(&vnn->public_address)));
3906 vnn->tcp_update_needed = false;
3910 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3911 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3912 ctdb_update_tcp_tickles, ctdb);
3917 start periodic update of tcp tickles
3919 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3921 ctdb->tickle_update_context = talloc_new(ctdb);
3923 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3924 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3925 ctdb_update_tcp_tickles, ctdb);
3931 struct control_gratious_arp {
3932 struct ctdb_context *ctdb;
3933 ctdb_sock_addr addr;
3934 const char *iface;
3935 int count;
3939 send a control_gratuitous arp
3941 static void send_gratious_arp(struct event_context *ev, struct timed_event *te,
3942 struct timeval t, void *private_data)
3944 int ret;
3945 struct control_gratious_arp *arp = talloc_get_type(private_data,
3946 struct control_gratious_arp);
3948 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3949 if (ret != 0) {
3950 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3951 arp->iface, strerror(errno)));
3955 arp->count++;
3956 if (arp->count == CTDB_ARP_REPEAT) {
3957 talloc_free(arp);
3958 return;
3961 event_add_timed(arp->ctdb->ev, arp,
3962 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3963 send_gratious_arp, arp);
3968 send a gratious arp
3970 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3972 struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3973 struct control_gratious_arp *arp;
3975 /* verify the size of indata */
3976 if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3977 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
3978 (unsigned)indata.dsize,
3979 (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3980 return -1;
3982 if (indata.dsize !=
3983 ( offsetof(struct ctdb_control_gratious_arp, iface)
3984 + gratious_arp->len ) ){
3986 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3987 "but should be %u bytes\n",
3988 (unsigned)indata.dsize,
3989 (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3990 return -1;
3994 arp = talloc(ctdb, struct control_gratious_arp);
3995 CTDB_NO_MEMORY(ctdb, arp);
3997 arp->ctdb = ctdb;
3998 arp->addr = gratious_arp->addr;
3999 arp->iface = talloc_strdup(arp, gratious_arp->iface);
4000 CTDB_NO_MEMORY(ctdb, arp->iface);
4001 arp->count = 0;
4003 event_add_timed(arp->ctdb->ev, arp,
4004 timeval_zero(), send_gratious_arp, arp);
4006 return 0;
4009 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4011 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4012 int ret;
4014 /* verify the size of indata */
4015 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4016 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4017 return -1;
4019 if (indata.dsize !=
4020 ( offsetof(struct ctdb_control_ip_iface, iface)
4021 + pub->len ) ){
4023 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4024 "but should be %u bytes\n",
4025 (unsigned)indata.dsize,
4026 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4027 return -1;
4030 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
4032 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
4034 if (ret != 0) {
4035 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4036 return -1;
4039 return 0;
/* State handed to delete_ip_callback() when a release is requested as
   part of deleting a public address. */
struct delete_ip_callback_state {
	struct ctdb_req_control *c;	/* control request to reply to */
};
4047 called when releaseip event finishes for del_public_address
4049 static void delete_ip_callback(struct ctdb_context *ctdb,
4050 int32_t status, TDB_DATA data,
4051 const char *errormsg,
4052 void *private_data)
4054 struct delete_ip_callback_state *state =
4055 talloc_get_type(private_data, struct delete_ip_callback_state);
4057 /* If release failed then fail. */
4058 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
4059 talloc_free(private_data);
4062 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
4063 struct ctdb_req_control *c,
4064 TDB_DATA indata, bool *async_reply)
4066 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4067 struct ctdb_vnn *vnn;
4069 /* verify the size of indata */
4070 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4071 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4072 return -1;
4074 if (indata.dsize !=
4075 ( offsetof(struct ctdb_control_ip_iface, iface)
4076 + pub->len ) ){
4078 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4079 "but should be %u bytes\n",
4080 (unsigned)indata.dsize,
4081 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4082 return -1;
4085 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4087 /* walk over all public addresses until we find a match */
4088 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4089 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4090 if (vnn->pnn == ctdb->pnn) {
4091 struct delete_ip_callback_state *state;
4092 struct ctdb_public_ip *ip;
4093 TDB_DATA data;
4094 int ret;
4096 vnn->delete_pending = true;
4098 state = talloc(ctdb,
4099 struct delete_ip_callback_state);
4100 CTDB_NO_MEMORY(ctdb, state);
4101 state->c = c;
4103 ip = talloc(state, struct ctdb_public_ip);
4104 if (ip == NULL) {
4105 DEBUG(DEBUG_ERR,
4106 (__location__ " Out of memory\n"));
4107 talloc_free(state);
4108 return -1;
4110 ip->pnn = -1;
4111 ip->addr = pub->addr;
4113 data.dsize = sizeof(struct ctdb_public_ip);
4114 data.dptr = (unsigned char *)ip;
4116 ret = ctdb_daemon_send_control(ctdb,
4117 ctdb_get_pnn(ctdb),
4119 CTDB_CONTROL_RELEASE_IP,
4120 0, 0,
4121 data,
4122 delete_ip_callback,
4123 state);
4124 if (ret == -1) {
4125 DEBUG(DEBUG_ERR,
4126 (__location__ "Unable to send "
4127 "CTDB_CONTROL_RELEASE_IP\n"));
4128 talloc_free(state);
4129 return -1;
4132 state->c = talloc_steal(state, c);
4133 *async_reply = true;
4134 } else {
4135 /* This IP is not hosted on the
4136 * current node so just delete it
4137 * now. */
4138 do_delete_ip(ctdb, vnn);
4141 return 0;
4145 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
4146 ctdb_addr_to_str(&pub->addr)));
4147 return -1;
/* State handed to ctdb_ipreallocated_callback() while the
   "ipreallocated" event script runs. */
struct ipreallocated_callback_state {
	struct ctdb_req_control *c;	/* control request to reply to */
};
4155 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4156 int status, void *p)
4158 struct ipreallocated_callback_state *state =
4159 talloc_get_type(p, struct ipreallocated_callback_state);
4161 if (status != 0) {
4162 DEBUG(DEBUG_ERR,
4163 (" \"ipreallocated\" event script failed (status %d)\n",
4164 status));
4165 if (status == -ETIME) {
4166 ctdb_ban_self(ctdb);
4170 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4171 talloc_free(state);
4174 /* A control to run the ipreallocated event */
4175 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4176 struct ctdb_req_control *c,
4177 bool *async_reply)
4179 int ret;
4180 struct ipreallocated_callback_state *state;
4182 state = talloc(ctdb, struct ipreallocated_callback_state);
4183 CTDB_NO_MEMORY(ctdb, state);
4185 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4187 ret = ctdb_event_script_callback(ctdb, state,
4188 ctdb_ipreallocated_callback, state,
4189 CTDB_EVENT_IPREALLOCATED,
4190 "%s", "");
4192 if (ret != 0) {
4193 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4194 talloc_free(state);
4195 return -1;
4198 /* tell the control that we will be reply asynchronously */
4199 state->c = talloc_steal(state, c);
4200 *async_reply = true;
4202 return 0;
4206 /* This function is called from the recovery daemon to verify that a remote
4207 node has the expected ip allocation.
4208 This is verified against ctdb->ip_tree
4210 int verify_remote_ip_allocation(struct ctdb_context *ctdb,
4211 struct ctdb_all_public_ips *ips,
4212 uint32_t pnn)
4214 struct ctdb_public_ip_list *tmp_ip;
4215 int i;
4217 if (ctdb->ip_tree == NULL) {
4218 /* dont know the expected allocation yet, assume remote node
4219 is correct. */
4220 return 0;
4223 if (ips == NULL) {
4224 return 0;
4227 for (i=0; i<ips->num; i++) {
4228 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4229 if (tmp_ip == NULL) {
4230 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
4231 return -1;
4234 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4235 continue;
4238 if (tmp_ip->pnn != ips->ips[i].pnn) {
4239 DEBUG(DEBUG_ERR,
4240 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
4241 pnn,
4242 ctdb_addr_to_str(&ips->ips[i].addr),
4243 ips->ips[i].pnn, tmp_ip->pnn));
4244 return -1;
4248 return 0;
4251 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4253 struct ctdb_public_ip_list *tmp_ip;
4255 /* IP tree is never built if DisableIPFailover is set */
4256 if (ctdb->tunable.disable_ip_failover != 0) {
4257 return 0;
4260 if (ctdb->ip_tree == NULL) {
4261 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4262 return -1;
4265 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4266 if (tmp_ip == NULL) {
4267 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4268 return -1;
4271 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4272 tmp_ip->pnn = ip->pnn;
4274 return 0;
4277 void clear_ip_assignment_tree(struct ctdb_context *ctdb)
4279 TALLOC_FREE(ctdb->ip_tree);
/* Bookkeeping for one in-flight "reload public ips" request. */
struct ctdb_reloadips_handle {
	struct ctdb_context *ctdb;	/* daemon context */
	struct ctdb_req_control *c;	/* request to reply to, NULL once replied */
	int status;			/* status sent in the reply */
	int fd[2];			/* pipe: child writes fd[1], parent reads fd[0] */
	pid_t child;			/* pid of the forked reloadips child */
	struct fd_event *fde;		/* read event on fd[0] */
};
4291 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4293 if (h == h->ctdb->reload_ips) {
4294 h->ctdb->reload_ips = NULL;
4296 if (h->c != NULL) {
4297 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4298 h->c = NULL;
4300 ctdb_kill(h->ctdb, h->child, SIGKILL);
4301 return 0;
4304 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4305 struct timed_event *te,
4306 struct timeval t, void *private_data)
4308 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4310 talloc_free(h);
4313 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde,
4314 uint16_t flags, void *private_data)
4316 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4318 char res;
4319 int ret;
4321 ret = sys_read(h->fd[0], &res, 1);
4322 if (ret < 1 || res != 0) {
4323 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4324 res = 1;
4326 h->status = res;
4328 talloc_free(h);
4331 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4333 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4334 struct ctdb_all_public_ips *ips;
4335 struct ctdb_vnn *vnn;
4336 struct client_async_data *async_data;
4337 struct timeval timeout;
4338 TDB_DATA data;
4339 struct ctdb_client_control_state *state;
4340 bool first_add;
4341 int i, ret;
4343 CTDB_NO_MEMORY(ctdb, mem_ctx);
4345 /* Read IPs from local node */
4346 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
4347 CTDB_CURRENT_NODE, mem_ctx, &ips);
4348 if (ret != 0) {
4349 DEBUG(DEBUG_ERR,
4350 ("Unable to fetch public IPs from local node\n"));
4351 talloc_free(mem_ctx);
4352 return -1;
4355 /* Read IPs file - this is safe since this is a child process */
4356 ctdb->vnn = NULL;
4357 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4358 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4359 talloc_free(mem_ctx);
4360 return -1;
4363 async_data = talloc_zero(mem_ctx, struct client_async_data);
4364 CTDB_NO_MEMORY(ctdb, async_data);
4366 /* Compare IPs between node and file for IPs to be deleted */
4367 for (i = 0; i < ips->num; i++) {
4368 /* */
4369 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4370 if (ctdb_same_ip(&vnn->public_address,
4371 &ips->ips[i].addr)) {
4372 /* IP is still in file */
4373 break;
4377 if (vnn == NULL) {
4378 /* Delete IP ips->ips[i] */
4379 struct ctdb_control_ip_iface *pub;
4381 DEBUG(DEBUG_NOTICE,
4382 ("IP %s no longer configured, deleting it\n",
4383 ctdb_addr_to_str(&ips->ips[i].addr)));
4385 pub = talloc_zero(mem_ctx,
4386 struct ctdb_control_ip_iface);
4387 CTDB_NO_MEMORY(ctdb, pub);
4389 pub->addr = ips->ips[i].addr;
4390 pub->mask = 0;
4391 pub->len = 0;
4393 timeout = TAKEOVER_TIMEOUT();
4395 data.dsize = offsetof(struct ctdb_control_ip_iface,
4396 iface) + pub->len;
4397 data.dptr = (uint8_t *)pub;
4399 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4400 CTDB_CONTROL_DEL_PUBLIC_IP,
4401 0, data, async_data,
4402 &timeout, NULL);
4403 if (state == NULL) {
4404 DEBUG(DEBUG_ERR,
4405 (__location__
4406 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
4407 goto failed;
4410 ctdb_client_async_add(async_data, state);
4414 /* Compare IPs between node and file for IPs to be added */
4415 first_add = true;
4416 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4417 for (i = 0; i < ips->num; i++) {
4418 if (ctdb_same_ip(&vnn->public_address,
4419 &ips->ips[i].addr)) {
4420 /* IP already on node */
4421 break;
4424 if (i == ips->num) {
4425 /* Add IP ips->ips[i] */
4426 struct ctdb_control_ip_iface *pub;
4427 const char *ifaces = NULL;
4428 uint32_t len;
4429 int iface = 0;
4431 DEBUG(DEBUG_NOTICE,
4432 ("New IP %s configured, adding it\n",
4433 ctdb_addr_to_str(&vnn->public_address)));
4434 if (first_add) {
4435 uint32_t pnn = ctdb_get_pnn(ctdb);
4437 data.dsize = sizeof(pnn);
4438 data.dptr = (uint8_t *)&pnn;
4440 ret = ctdb_client_send_message(
4441 ctdb,
4442 CTDB_BROADCAST_CONNECTED,
4443 CTDB_SRVID_REBALANCE_NODE,
4444 data);
4445 if (ret != 0) {
4446 DEBUG(DEBUG_WARNING,
4447 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
4450 first_add = false;
4453 ifaces = vnn->ifaces[0];
4454 iface = 1;
4455 while (vnn->ifaces[iface] != NULL) {
4456 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
4457 vnn->ifaces[iface]);
4458 iface++;
4461 len = strlen(ifaces) + 1;
4462 pub = talloc_zero_size(mem_ctx,
4463 offsetof(struct ctdb_control_ip_iface, iface) + len);
4464 CTDB_NO_MEMORY(ctdb, pub);
4466 pub->addr = vnn->public_address;
4467 pub->mask = vnn->public_netmask_bits;
4468 pub->len = len;
4469 memcpy(&pub->iface[0], ifaces, pub->len);
4471 timeout = TAKEOVER_TIMEOUT();
4473 data.dsize = offsetof(struct ctdb_control_ip_iface,
4474 iface) + pub->len;
4475 data.dptr = (uint8_t *)pub;
4477 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
4478 CTDB_CONTROL_ADD_PUBLIC_IP,
4479 0, data, async_data,
4480 &timeout, NULL);
4481 if (state == NULL) {
4482 DEBUG(DEBUG_ERR,
4483 (__location__
4484 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
4485 goto failed;
4488 ctdb_client_async_add(async_data, state);
4492 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
4493 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
4494 goto failed;
4497 talloc_free(mem_ctx);
4498 return 0;
4500 failed:
4501 talloc_free(mem_ctx);
4502 return -1;
4505 /* This control is sent to force the node to re-read the public addresses file
4506 and drop any addresses we should nnot longer host, and add new addresses
4507 that we are now able to host
4509 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4511 struct ctdb_reloadips_handle *h;
4512 pid_t parent = getpid();
4514 if (ctdb->reload_ips != NULL) {
4515 talloc_free(ctdb->reload_ips);
4516 ctdb->reload_ips = NULL;
4519 h = talloc(ctdb, struct ctdb_reloadips_handle);
4520 CTDB_NO_MEMORY(ctdb, h);
4521 h->ctdb = ctdb;
4522 h->c = NULL;
4523 h->status = -1;
4525 if (pipe(h->fd) == -1) {
4526 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4527 talloc_free(h);
4528 return -1;
4531 h->child = ctdb_fork(ctdb);
4532 if (h->child == (pid_t)-1) {
4533 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4534 close(h->fd[0]);
4535 close(h->fd[1]);
4536 talloc_free(h);
4537 return -1;
4540 /* child process */
4541 if (h->child == 0) {
4542 signed char res = 0;
4544 close(h->fd[0]);
4545 debug_extra = talloc_asprintf(NULL, "reloadips:");
4547 ctdb_set_process_name("ctdb_reloadips");
4548 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4549 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4550 res = -1;
4551 } else {
4552 res = ctdb_reloadips_child(ctdb);
4553 if (res != 0) {
4554 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4558 sys_write(h->fd[1], &res, 1);
4559 /* make sure we die when our parent dies */
4560 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4561 sleep(5);
4563 _exit(0);
4566 h->c = talloc_steal(h, c);
4568 close(h->fd[1]);
4569 set_close_on_exec(h->fd[0]);
4571 talloc_set_destructor(h, ctdb_reloadips_destructor);
4574 h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4575 EVENT_FD_READ, ctdb_reloadips_child_handler,
4576 (void *)h);
4577 tevent_fd_set_auto_close(h->fde);
4579 event_add_timed(ctdb->ev, h,
4580 timeval_current_ofs(120, 0),
4581 ctdb_reloadips_timeout_event, h);
4583 /* we reply later */
4584 *async_reply = true;
4585 return 0;