recoverd: Nodes can only takeover IPs if they are in runstate RUNNING
[Samba/wip.git] / ctdb / server / ctdb_takeover.c
blobb5e7f87c69578c820ffb0a2a5e3d502c28e51b35
1 /*
2 ctdb ip takeover code
4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "includes.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
/* Timeout value derived from the takeover_timeout tunable.  Expands to an
 * expression that reads a variable named `ctdb` from the scope where the
 * macro is used.  NOTE(review): presumably "now + takeover_timeout
 * seconds" -- confirm against timeval_current_ofs(). */
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
/* First argument of timeval_current_ofs() when ctdb_control_send_arp()
 * re-arms its timer. */
33 #define CTDB_ARP_INTERVAL 1
/* ctdb_control_send_arp() frees its state once its round counter reaches
 * this value. */
34 #define CTDB_ARP_REPEAT 3
/*
 * Flags used in IP allocation algorithms.
 */
struct ctdb_ipflags {
	/* NOTE(review): presumably "must not take over addresses" - confirm */
	bool noiptakeover;
	/* NOTE(review): presumably "must not host addresses" - confirm */
	bool noiphost;
};
/*
 * One local network interface known to the daemon.  Entries are kept on
 * the doubly linked list headed by ctdb->ifaces.
 */
struct ctdb_iface {
	/* list linkage */
	struct ctdb_iface *prev;
	struct ctdb_iface *next;
	/* interface name; allocated as a talloc child of this entry */
	const char *name;
	/* only interfaces with link_up set are picked to host addresses */
	bool link_up;
	/* number of public addresses currently assigned to this interface */
	uint32_t references;
};
49 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
51 if (vnn->iface) {
52 return vnn->iface->name;
55 return "__none__";
58 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
60 struct ctdb_iface *i;
62 /* Verify that we dont have an entry for this ip yet */
63 for (i=ctdb->ifaces;i;i=i->next) {
64 if (strcmp(i->name, iface) == 0) {
65 return 0;
69 /* create a new structure for this interface */
70 i = talloc_zero(ctdb, struct ctdb_iface);
71 CTDB_NO_MEMORY_FATAL(ctdb, i);
72 i->name = talloc_strdup(i, iface);
73 CTDB_NO_MEMORY(ctdb, i->name);
75 * If link_up defaults to true then IPs can be allocated to a
76 * node during the first recovery. However, then an interface
77 * could have its link marked down during the startup event,
78 * causing the IP to move almost immediately. If link_up
79 * defaults to false then, during normal operation, IPs added
80 * to a new interface can't be assigned until a monitor cycle
81 * has occurred and marked the new interfaces up. This makes
82 * IP allocation unpredictable. The following is a neat
83 * compromise: early in startup link_up defaults to false, so
84 * IPs can't be assigned, and after startup IPs can be
85 * assigned immediately.
87 i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
89 DLIST_ADD(ctdb->ifaces, i);
91 return 0;
94 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
95 const char *name)
97 int n;
99 for (n = 0; vnn->ifaces[n] != NULL; n++) {
100 if (strcmp(name, vnn->ifaces[n]) == 0) {
101 return true;
105 return false;
108 /* If any interfaces now have no possible IPs then delete them. This
109 * implementation is naive (i.e. simple) rather than clever
110 * (i.e. complex). Given that this is run on delip and that operation
111 * is rare, this doesn't need to be efficient - it needs to be
112 * foolproof. One alternative is reference counting, where the logic
113 * is distributed and can, therefore, be broken in multiple places.
114 * Another alternative is to build a red-black tree of interfaces that
115 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
116 * once) and then walking ctdb->ifaces once and deleting those not in
117 * the tree. Let's go to one of those if the naive implementation
118 * causes problems... :-)
120 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
121 struct ctdb_vnn *vnn,
122 TALLOC_CTX *mem_ctx)
124 struct ctdb_iface *i;
126 /* For each interface, check if there's an IP using it. */
127 for(i=ctdb->ifaces; i; i=i->next) {
128 struct ctdb_vnn *tv;
129 bool found;
131 /* Only consider interfaces named in the given VNN. */
132 if (!vnn_has_interface_with_name(vnn, i->name)) {
133 continue;
136 /* Is the "single IP" on this interface? */
137 if ((ctdb->single_ip_vnn != NULL) &&
138 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
139 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
140 /* Found, next interface please... */
141 continue;
143 /* Search for a vnn with this interface. */
144 found = false;
145 for (tv=ctdb->vnn; tv; tv=tv->next) {
146 if (vnn_has_interface_with_name(tv, i->name)) {
147 found = true;
148 break;
152 if (!found) {
153 /* None of the VNNs are using this interface. */
154 DLIST_REMOVE(ctdb->ifaces, i);
155 /* Caller will free mem_ctx when convenient. */
156 talloc_steal(mem_ctx, i);
162 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
163 const char *iface)
165 struct ctdb_iface *i;
167 for (i=ctdb->ifaces;i;i=i->next) {
168 if (strcmp(i->name, iface) == 0) {
169 return i;
173 return NULL;
176 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
177 struct ctdb_vnn *vnn)
179 int i;
180 struct ctdb_iface *cur = NULL;
181 struct ctdb_iface *best = NULL;
183 for (i=0; vnn->ifaces[i]; i++) {
185 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
186 if (cur == NULL) {
187 continue;
190 if (!cur->link_up) {
191 continue;
194 if (best == NULL) {
195 best = cur;
196 continue;
199 if (cur->references < best->references) {
200 best = cur;
201 continue;
205 return best;
208 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
209 struct ctdb_vnn *vnn)
211 struct ctdb_iface *best = NULL;
213 if (vnn->iface) {
214 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
215 "still assigned to iface '%s'\n",
216 ctdb_addr_to_str(&vnn->public_address),
217 ctdb_vnn_iface_string(vnn)));
218 return 0;
221 best = ctdb_vnn_best_iface(ctdb, vnn);
222 if (best == NULL) {
223 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
224 "cannot assign to iface any iface\n",
225 ctdb_addr_to_str(&vnn->public_address)));
226 return -1;
229 vnn->iface = best;
230 best->references++;
231 vnn->pnn = ctdb->pnn;
233 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
234 "now assigned to iface '%s' refs[%d]\n",
235 ctdb_addr_to_str(&vnn->public_address),
236 ctdb_vnn_iface_string(vnn),
237 best->references));
238 return 0;
241 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
242 struct ctdb_vnn *vnn)
244 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
245 "now unassigned (old iface '%s' refs[%d])\n",
246 ctdb_addr_to_str(&vnn->public_address),
247 ctdb_vnn_iface_string(vnn),
248 vnn->iface?vnn->iface->references:0));
249 if (vnn->iface) {
250 vnn->iface->references--;
252 vnn->iface = NULL;
253 if (vnn->pnn == ctdb->pnn) {
254 vnn->pnn = -1;
258 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
259 struct ctdb_vnn *vnn)
261 int i;
263 if (vnn->iface && vnn->iface->link_up) {
264 return true;
267 for (i=0; vnn->ifaces[i]; i++) {
268 struct ctdb_iface *cur;
270 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
271 if (cur == NULL) {
272 continue;
275 if (cur->link_up) {
276 return true;
280 return false;
283 struct ctdb_takeover_arp {
284 struct ctdb_context *ctdb;
285 uint32_t count;
286 ctdb_sock_addr addr;
287 struct ctdb_tcp_array *tcparray;
288 struct ctdb_vnn *vnn;
293 lists of tcp endpoints
295 struct ctdb_tcp_list {
296 struct ctdb_tcp_list *prev, *next;
297 struct ctdb_tcp_connection connection;
301 list of clients to kill on IP release
303 struct ctdb_client_ip {
304 struct ctdb_client_ip *prev, *next;
305 struct ctdb_context *ctdb;
306 ctdb_sock_addr addr;
307 uint32_t client_id;
312 send a gratuitous arp
314 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te,
315 struct timeval t, void *private_data)
317 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
318 struct ctdb_takeover_arp);
319 int i, ret;
320 struct ctdb_tcp_array *tcparray;
321 const char *iface = ctdb_vnn_iface_string(arp->vnn);
323 ret = ctdb_sys_send_arp(&arp->addr, iface);
324 if (ret != 0) {
325 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
326 iface, strerror(errno)));
329 tcparray = arp->tcparray;
330 if (tcparray) {
331 for (i=0;i<tcparray->num;i++) {
332 struct ctdb_tcp_connection *tcon;
334 tcon = &tcparray->connections[i];
335 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
336 (unsigned)ntohs(tcon->dst_addr.ip.sin_port),
337 ctdb_addr_to_str(&tcon->src_addr),
338 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
339 ret = ctdb_sys_send_tcp(
340 &tcon->src_addr,
341 &tcon->dst_addr,
342 0, 0, 0);
343 if (ret != 0) {
344 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
345 ctdb_addr_to_str(&tcon->src_addr)));
350 arp->count++;
352 if (arp->count == CTDB_ARP_REPEAT) {
353 talloc_free(arp);
354 return;
357 event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx,
358 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
359 ctdb_control_send_arp, arp);
362 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
363 struct ctdb_vnn *vnn)
365 struct ctdb_takeover_arp *arp;
366 struct ctdb_tcp_array *tcparray;
368 if (!vnn->takeover_ctx) {
369 vnn->takeover_ctx = talloc_new(vnn);
370 if (!vnn->takeover_ctx) {
371 return -1;
375 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
376 if (!arp) {
377 return -1;
380 arp->ctdb = ctdb;
381 arp->addr = vnn->public_address;
382 arp->vnn = vnn;
384 tcparray = vnn->tcp_array;
385 if (tcparray) {
386 /* add all of the known tcp connections for this IP to the
387 list of tcp connections to send tickle acks for */
388 arp->tcparray = talloc_steal(arp, tcparray);
390 vnn->tcp_array = NULL;
391 vnn->tcp_update_needed = true;
394 event_add_timed(arp->ctdb->ev, vnn->takeover_ctx,
395 timeval_zero(), ctdb_control_send_arp, arp);
397 return 0;
400 struct takeover_callback_state {
401 struct ctdb_req_control *c;
402 ctdb_sock_addr *addr;
403 struct ctdb_vnn *vnn;
/*
 * Context handed to ctdb_do_takeip_callback() while a "takeip" event
 * script runs.
 */
struct ctdb_do_takeip_state {
	/* control request to answer when the event has finished */
	struct ctdb_req_control *c;
	/* public address entry being taken over */
	struct ctdb_vnn *vnn;
};
412 called when takeip event finishes
414 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
415 void *private_data)
417 struct ctdb_do_takeip_state *state =
418 talloc_get_type(private_data, struct ctdb_do_takeip_state);
419 int32_t ret;
420 TDB_DATA data;
422 if (status != 0) {
423 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
425 if (status == -ETIME) {
426 ctdb_ban_self(ctdb);
428 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
429 ctdb_addr_to_str(&state->vnn->public_address),
430 ctdb_vnn_iface_string(state->vnn)));
431 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
433 node->flags |= NODE_FLAGS_UNHEALTHY;
434 talloc_free(state);
435 return;
438 if (ctdb->do_checkpublicip) {
440 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
441 if (ret != 0) {
442 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
443 talloc_free(state);
444 return;
449 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
450 data.dsize = strlen((char *)data.dptr) + 1;
451 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
453 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
456 /* the control succeeded */
457 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
458 talloc_free(state);
459 return;
462 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
464 state->vnn->update_in_flight = false;
465 return 0;
469 take over an ip address
471 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
472 struct ctdb_req_control *c,
473 struct ctdb_vnn *vnn)
475 int ret;
476 struct ctdb_do_takeip_state *state;
478 if (vnn->update_in_flight) {
479 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
480 "update for this IP already in flight\n",
481 ctdb_addr_to_str(&vnn->public_address),
482 vnn->public_netmask_bits));
483 return -1;
486 ret = ctdb_vnn_assign_iface(ctdb, vnn);
487 if (ret != 0) {
488 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
489 "assign a usable interface\n",
490 ctdb_addr_to_str(&vnn->public_address),
491 vnn->public_netmask_bits));
492 return -1;
495 state = talloc(vnn, struct ctdb_do_takeip_state);
496 CTDB_NO_MEMORY(ctdb, state);
498 state->c = talloc_steal(ctdb, c);
499 state->vnn = vnn;
501 vnn->update_in_flight = true;
502 talloc_set_destructor(state, ctdb_takeip_destructor);
504 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
505 ctdb_addr_to_str(&vnn->public_address),
506 vnn->public_netmask_bits,
507 ctdb_vnn_iface_string(vnn)));
509 ret = ctdb_event_script_callback(ctdb,
510 state,
511 ctdb_do_takeip_callback,
512 state,
513 false,
514 CTDB_EVENT_TAKE_IP,
515 "%s %s %u",
516 ctdb_vnn_iface_string(vnn),
517 ctdb_addr_to_str(&vnn->public_address),
518 vnn->public_netmask_bits);
520 if (ret != 0) {
521 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
522 ctdb_addr_to_str(&vnn->public_address),
523 ctdb_vnn_iface_string(vnn)));
524 talloc_free(state);
525 return -1;
528 return 0;
/*
 * Context handed to ctdb_do_updateip_callback() while an "updateip"
 * event script runs.
 */
struct ctdb_do_updateip_state {
	/* control request to answer when the event has finished */
	struct ctdb_req_control *c;
	/* interface the address was on before the move */
	struct ctdb_iface *old;
	/* public address entry being moved */
	struct ctdb_vnn *vnn;
};
538 called when updateip event finishes
540 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
541 void *private_data)
543 struct ctdb_do_updateip_state *state =
544 talloc_get_type(private_data, struct ctdb_do_updateip_state);
545 int32_t ret;
547 if (status != 0) {
548 if (status == -ETIME) {
549 ctdb_ban_self(ctdb);
551 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
552 ctdb_addr_to_str(&state->vnn->public_address),
553 state->old->name,
554 ctdb_vnn_iface_string(state->vnn)));
557 * All we can do is reset the old interface
558 * and let the next run fix it
560 ctdb_vnn_unassign_iface(ctdb, state->vnn);
561 state->vnn->iface = state->old;
562 state->vnn->iface->references++;
564 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
565 talloc_free(state);
566 return;
569 if (ctdb->do_checkpublicip) {
571 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
572 if (ret != 0) {
573 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
574 talloc_free(state);
575 return;
580 /* the control succeeded */
581 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
582 talloc_free(state);
583 return;
586 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
588 state->vnn->update_in_flight = false;
589 return 0;
593 update (move) an ip address
595 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
596 struct ctdb_req_control *c,
597 struct ctdb_vnn *vnn)
599 int ret;
600 struct ctdb_do_updateip_state *state;
601 struct ctdb_iface *old = vnn->iface;
602 const char *new_name;
604 if (vnn->update_in_flight) {
605 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
606 "update for this IP already in flight\n",
607 ctdb_addr_to_str(&vnn->public_address),
608 vnn->public_netmask_bits));
609 return -1;
612 ctdb_vnn_unassign_iface(ctdb, vnn);
613 ret = ctdb_vnn_assign_iface(ctdb, vnn);
614 if (ret != 0) {
615 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
616 "assin a usable interface (old iface '%s')\n",
617 ctdb_addr_to_str(&vnn->public_address),
618 vnn->public_netmask_bits,
619 old->name));
620 return -1;
623 new_name = ctdb_vnn_iface_string(vnn);
624 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
625 /* A benign update from one interface onto itself.
626 * no need to run the eventscripts in this case, just return
627 * success.
629 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
630 return 0;
633 state = talloc(vnn, struct ctdb_do_updateip_state);
634 CTDB_NO_MEMORY(ctdb, state);
636 state->c = talloc_steal(ctdb, c);
637 state->old = old;
638 state->vnn = vnn;
640 vnn->update_in_flight = true;
641 talloc_set_destructor(state, ctdb_updateip_destructor);
643 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
644 "interface %s to %s\n",
645 ctdb_addr_to_str(&vnn->public_address),
646 vnn->public_netmask_bits,
647 old->name,
648 new_name));
650 ret = ctdb_event_script_callback(ctdb,
651 state,
652 ctdb_do_updateip_callback,
653 state,
654 false,
655 CTDB_EVENT_UPDATE_IP,
656 "%s %s %s %u",
657 state->old->name,
658 new_name,
659 ctdb_addr_to_str(&vnn->public_address),
660 vnn->public_netmask_bits);
661 if (ret != 0) {
662 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
663 ctdb_addr_to_str(&vnn->public_address),
664 old->name, new_name));
665 talloc_free(state);
666 return -1;
669 return 0;
673 Find the vnn of the node that has a public ip address
674 returns -1 if the address is not known as a public address
676 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
678 struct ctdb_vnn *vnn;
680 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
681 if (ctdb_same_ip(&vnn->public_address, addr)) {
682 return vnn;
686 return NULL;
690 take over an ip address
692 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
693 struct ctdb_req_control *c,
694 TDB_DATA indata,
695 bool *async_reply)
697 int ret;
698 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
699 struct ctdb_vnn *vnn;
700 bool have_ip = false;
701 bool do_updateip = false;
702 bool do_takeip = false;
703 struct ctdb_iface *best_iface = NULL;
705 if (pip->pnn != ctdb->pnn) {
706 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
707 "with pnn %d, but we're node %d\n",
708 ctdb_addr_to_str(&pip->addr),
709 pip->pnn, ctdb->pnn));
710 return -1;
713 /* update out vnn list */
714 vnn = find_public_ip_vnn(ctdb, &pip->addr);
715 if (vnn == NULL) {
716 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
717 ctdb_addr_to_str(&pip->addr)));
718 return 0;
721 if (ctdb->do_checkpublicip) {
722 have_ip = ctdb_sys_have_ip(&pip->addr);
724 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
725 if (best_iface == NULL) {
726 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
727 "a usable interface (old %s, have_ip %d)\n",
728 ctdb_addr_to_str(&vnn->public_address),
729 vnn->public_netmask_bits,
730 ctdb_vnn_iface_string(vnn),
731 have_ip));
732 return -1;
735 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
736 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
737 have_ip = false;
741 if (vnn->iface == NULL && have_ip) {
742 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
743 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
744 ctdb_addr_to_str(&vnn->public_address)));
745 return 0;
748 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
749 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
750 "and we have it on iface[%s], but it was assigned to node %d"
751 "and we are node %d, banning ourself\n",
752 ctdb_addr_to_str(&vnn->public_address),
753 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
754 ctdb_ban_self(ctdb);
755 return -1;
758 if (vnn->pnn == -1 && have_ip) {
759 vnn->pnn = ctdb->pnn;
760 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
761 "and we already have it on iface[%s], update local daemon\n",
762 ctdb_addr_to_str(&vnn->public_address),
763 ctdb_vnn_iface_string(vnn)));
764 return 0;
767 if (vnn->iface) {
768 if (vnn->iface != best_iface) {
769 if (!vnn->iface->link_up) {
770 do_updateip = true;
771 } else if (vnn->iface->references > (best_iface->references + 1)) {
772 /* only move when the rebalance gains something */
773 do_updateip = true;
778 if (!have_ip) {
779 if (do_updateip) {
780 ctdb_vnn_unassign_iface(ctdb, vnn);
781 do_updateip = false;
783 do_takeip = true;
786 if (do_takeip) {
787 ret = ctdb_do_takeip(ctdb, c, vnn);
788 if (ret != 0) {
789 return -1;
791 } else if (do_updateip) {
792 ret = ctdb_do_updateip(ctdb, c, vnn);
793 if (ret != 0) {
794 return -1;
796 } else {
798 * The interface is up and the kernel known the ip
799 * => do nothing
801 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
802 ctdb_addr_to_str(&pip->addr),
803 vnn->public_netmask_bits,
804 ctdb_vnn_iface_string(vnn)));
805 return 0;
808 /* tell ctdb_control.c that we will be replying asynchronously */
809 *async_reply = true;
811 return 0;
815 takeover an ip address old v4 style
817 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb,
818 struct ctdb_req_control *c,
819 TDB_DATA indata,
820 bool *async_reply)
822 TDB_DATA data;
824 data.dsize = sizeof(struct ctdb_public_ip);
825 data.dptr = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
826 CTDB_NO_MEMORY(ctdb, data.dptr);
828 memcpy(data.dptr, indata.dptr, indata.dsize);
829 return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
833 kill any clients that are registered with a IP that is being released
835 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
837 struct ctdb_client_ip *ip;
839 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
840 ctdb_addr_to_str(addr)));
842 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
843 ctdb_sock_addr tmp_addr;
845 tmp_addr = ip->addr;
846 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
847 ip->client_id,
848 ctdb_addr_to_str(&ip->addr)));
850 if (ctdb_same_ip(&tmp_addr, addr)) {
851 struct ctdb_client *client = ctdb_reqid_find(ctdb,
852 ip->client_id,
853 struct ctdb_client);
854 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
855 ip->client_id,
856 ctdb_addr_to_str(&ip->addr),
857 client->pid));
859 if (client->pid != 0) {
860 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
861 (unsigned)client->pid,
862 ctdb_addr_to_str(addr),
863 ip->client_id));
864 ctdb_kill(ctdb, client->pid, SIGKILL);
871 called when releaseip event finishes
873 static void release_ip_callback(struct ctdb_context *ctdb, int status,
874 void *private_data)
876 struct takeover_callback_state *state =
877 talloc_get_type(private_data, struct takeover_callback_state);
878 TDB_DATA data;
880 if (status == -ETIME) {
881 ctdb_ban_self(ctdb);
884 /* send a message to all clients of this node telling them
885 that the cluster has been reconfigured and they should
886 release any sockets on this IP */
887 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
888 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
889 data.dsize = strlen((char *)data.dptr)+1;
891 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
893 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
895 /* kill clients that have registered with this IP */
896 release_kill_clients(ctdb, state->addr);
898 ctdb_vnn_unassign_iface(ctdb, state->vnn);
900 /* the control succeeded */
901 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
902 talloc_free(state);
905 static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
907 state->vnn->update_in_flight = false;
908 return 0;
912 release an ip address
914 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
915 struct ctdb_req_control *c,
916 TDB_DATA indata,
917 bool *async_reply)
919 int ret;
920 struct takeover_callback_state *state;
921 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
922 struct ctdb_vnn *vnn;
923 char *iface;
925 /* update our vnn list */
926 vnn = find_public_ip_vnn(ctdb, &pip->addr);
927 if (vnn == NULL) {
928 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
929 ctdb_addr_to_str(&pip->addr)));
930 return 0;
932 vnn->pnn = pip->pnn;
934 /* stop any previous arps */
935 talloc_free(vnn->takeover_ctx);
936 vnn->takeover_ctx = NULL;
938 /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
939 * lazy multicast to drop an IP from any node that isn't the
940 * intended new node. The following causes makes ctdbd ignore
941 * a release for any address it doesn't host.
943 if (ctdb->do_checkpublicip) {
944 if (!ctdb_sys_have_ip(&pip->addr)) {
945 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
946 ctdb_addr_to_str(&pip->addr),
947 vnn->public_netmask_bits,
948 ctdb_vnn_iface_string(vnn)));
949 ctdb_vnn_unassign_iface(ctdb, vnn);
950 return 0;
952 } else {
953 if (vnn->iface == NULL) {
954 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
955 ctdb_addr_to_str(&pip->addr),
956 vnn->public_netmask_bits));
957 return 0;
961 /* There is a potential race between take_ip and us because we
962 * update the VNN via a callback that run when the
963 * eventscripts have been run. Avoid the race by allowing one
964 * update to be in flight at a time.
966 if (vnn->update_in_flight) {
967 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
968 "update for this IP already in flight\n",
969 ctdb_addr_to_str(&vnn->public_address),
970 vnn->public_netmask_bits));
971 return -1;
974 if (ctdb->do_checkpublicip) {
975 iface = ctdb_sys_find_ifname(&pip->addr);
976 if (iface == NULL) {
977 DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
978 return 0;
980 } else {
981 iface = strdup(ctdb_vnn_iface_string(vnn));
984 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
985 ctdb_addr_to_str(&pip->addr),
986 vnn->public_netmask_bits,
987 iface,
988 pip->pnn));
990 state = talloc(ctdb, struct takeover_callback_state);
991 CTDB_NO_MEMORY(ctdb, state);
993 state->c = talloc_steal(state, c);
994 state->addr = talloc(state, ctdb_sock_addr);
995 CTDB_NO_MEMORY(ctdb, state->addr);
996 *state->addr = pip->addr;
997 state->vnn = vnn;
999 vnn->update_in_flight = true;
1000 talloc_set_destructor(state, ctdb_releaseip_destructor);
1002 ret = ctdb_event_script_callback(ctdb,
1003 state, release_ip_callback, state,
1004 false,
1005 CTDB_EVENT_RELEASE_IP,
1006 "%s %s %u",
1007 iface,
1008 ctdb_addr_to_str(&pip->addr),
1009 vnn->public_netmask_bits);
1010 free(iface);
1011 if (ret != 0) {
1012 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1013 ctdb_addr_to_str(&pip->addr),
1014 ctdb_vnn_iface_string(vnn)));
1015 talloc_free(state);
1016 return -1;
1019 /* tell the control that we will be reply asynchronously */
1020 *async_reply = true;
1021 return 0;
1025 release an ip address old v4 style
1027 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb,
1028 struct ctdb_req_control *c,
1029 TDB_DATA indata,
1030 bool *async_reply)
1032 TDB_DATA data;
1034 data.dsize = sizeof(struct ctdb_public_ip);
1035 data.dptr = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
1036 CTDB_NO_MEMORY(ctdb, data.dptr);
1038 memcpy(data.dptr, indata.dptr, indata.dsize);
1039 return ctdb_control_release_ip(ctdb, c, data, async_reply);
1043 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1044 ctdb_sock_addr *addr,
1045 unsigned mask, const char *ifaces,
1046 bool check_address)
1048 struct ctdb_vnn *vnn;
1049 uint32_t num = 0;
1050 char *tmp;
1051 const char *iface;
1052 int i;
1053 int ret;
1055 tmp = strdup(ifaces);
1056 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1057 if (!ctdb_sys_check_iface_exists(iface)) {
1058 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1059 free(tmp);
1060 return -1;
1063 free(tmp);
1065 /* Verify that we dont have an entry for this ip yet */
1066 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1067 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1068 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1069 ctdb_addr_to_str(addr)));
1070 return -1;
1074 /* create a new vnn structure for this ip address */
1075 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1076 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1077 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1078 tmp = talloc_strdup(vnn, ifaces);
1079 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1080 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1081 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1082 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1083 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1084 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1085 num++;
1087 talloc_free(tmp);
1088 vnn->ifaces[num] = NULL;
1089 vnn->public_address = *addr;
1090 vnn->public_netmask_bits = mask;
1091 vnn->pnn = -1;
1092 if (check_address) {
1093 if (ctdb_sys_have_ip(addr)) {
1094 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1095 vnn->pnn = ctdb->pnn;
1099 for (i=0; vnn->ifaces[i]; i++) {
1100 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1101 if (ret != 0) {
1102 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1103 "for public_address[%s]\n",
1104 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1105 talloc_free(vnn);
1106 return -1;
1110 DLIST_ADD(ctdb->vnn, vnn);
1112 return 0;
1116 setup the event script directory
1118 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
1120 ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
1121 CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
1122 return 0;
1125 static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te,
1126 struct timeval t, void *private_data)
1128 struct ctdb_context *ctdb = talloc_get_type(private_data,
1129 struct ctdb_context);
1130 struct ctdb_vnn *vnn;
1132 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1133 int i;
1135 for (i=0; vnn->ifaces[i] != NULL; i++) {
1136 if (!ctdb_sys_check_iface_exists(vnn->ifaces[i])) {
1137 DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
1138 vnn->ifaces[i],
1139 ctdb_addr_to_str(&vnn->public_address)));
1144 event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx,
1145 timeval_current_ofs(30, 0),
1146 ctdb_check_interfaces_event, ctdb);
1150 int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
1152 if (ctdb->check_public_ifaces_ctx != NULL) {
1153 talloc_free(ctdb->check_public_ifaces_ctx);
1154 ctdb->check_public_ifaces_ctx = NULL;
1157 ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
1158 if (ctdb->check_public_ifaces_ctx == NULL) {
1159 ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
1162 event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx,
1163 timeval_current_ofs(30, 0),
1164 ctdb_check_interfaces_event, ctdb);
1166 return 0;
1171 setup the public address lists from a file
1173 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1175 char **lines;
1176 int nlines;
1177 int i;
1179 lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
1180 if (lines == NULL) {
1181 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1182 return -1;
1184 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1185 nlines--;
1188 for (i=0;i<nlines;i++) {
1189 unsigned mask;
1190 ctdb_sock_addr addr;
1191 const char *addrstr;
1192 const char *ifaces;
1193 char *tok, *line;
1195 line = lines[i];
1196 while ((*line == ' ') || (*line == '\t')) {
1197 line++;
1199 if (*line == '#') {
1200 continue;
1202 if (strcmp(line, "") == 0) {
1203 continue;
1205 tok = strtok(line, " \t");
1206 addrstr = tok;
1207 tok = strtok(NULL, " \t");
1208 if (tok == NULL) {
1209 if (NULL == ctdb->default_public_interface) {
1210 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1211 i+1));
1212 talloc_free(lines);
1213 return -1;
1215 ifaces = ctdb->default_public_interface;
1216 } else {
1217 ifaces = tok;
1220 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1221 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1222 talloc_free(lines);
1223 return -1;
1225 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1226 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1227 talloc_free(lines);
1228 return -1;
1233 talloc_free(lines);
1234 return 0;
1237 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1238 const char *iface,
1239 const char *ip)
1241 struct ctdb_vnn *svnn;
1242 struct ctdb_iface *cur = NULL;
1243 bool ok;
1244 int ret;
1246 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1247 CTDB_NO_MEMORY(ctdb, svnn);
1249 svnn->ifaces = talloc_array(svnn, const char *, 2);
1250 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1251 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1252 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1253 svnn->ifaces[1] = NULL;
1255 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1256 if (!ok) {
1257 talloc_free(svnn);
1258 return -1;
1261 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1262 if (ret != 0) {
1263 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1264 "for single_ip[%s]\n",
1265 svnn->ifaces[0],
1266 ctdb_addr_to_str(&svnn->public_address)));
1267 talloc_free(svnn);
1268 return -1;
1271 /* assume the single public ip interface is initially "good" */
1272 cur = ctdb_find_iface(ctdb, iface);
1273 if (cur == NULL) {
1274 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1275 return -1;
1277 cur->link_up = true;
1279 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1280 if (ret != 0) {
1281 talloc_free(svnn);
1282 return -1;
1285 ctdb->single_ip_vnn = svnn;
1286 return 0;
1289 /* Given a physical node, return the number of
1290 public addresses that is currently assigned to this node.
1292 static int node_ip_coverage(struct ctdb_context *ctdb,
1293 int32_t pnn,
1294 struct ctdb_public_ip_list *ips)
1296 int num=0;
1298 for (;ips;ips=ips->next) {
1299 if (ips->pnn == pnn) {
1300 num++;
1303 return num;
1307 /* Can the given node host the given IP: is the public IP known to the
1308 * node and is NOIPHOST unset?
1310 static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
1311 struct ctdb_ipflags ipflags,
1312 struct ctdb_public_ip_list *ip)
1314 struct ctdb_all_public_ips *public_ips;
1315 int i;
1317 if (ipflags.noiphost) {
1318 return false;
1321 public_ips = ctdb->nodes[pnn]->available_public_ips;
1323 if (public_ips == NULL) {
1324 return false;
1327 for (i=0; i<public_ips->num; i++) {
1328 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
1329 /* yes, this node can serve this public ip */
1330 return true;
1334 return false;
1337 static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
1338 struct ctdb_ipflags ipflags,
1339 struct ctdb_public_ip_list *ip)
1341 if (ipflags.noiptakeover) {
1342 return false;
1345 return can_node_host_ip(ctdb, pnn, ipflags, ip);
1348 /* search the node lists list for a node to takeover this ip.
1349 pick the node that currently are serving the least number of ips
1350 so that the ips get spread out evenly.
1352 static int find_takeover_node(struct ctdb_context *ctdb,
1353 struct ctdb_ipflags *ipflags,
1354 struct ctdb_public_ip_list *ip,
1355 struct ctdb_public_ip_list *all_ips)
1357 int pnn, min=0, num;
1358 int i, numnodes;
1360 numnodes = talloc_array_length(ipflags);
1361 pnn = -1;
1362 for (i=0; i<numnodes; i++) {
1363 /* verify that this node can serve this ip */
1364 if (!can_node_takeover_ip(ctdb, i, ipflags[i], ip)) {
1365 /* no it couldnt so skip to the next node */
1366 continue;
1369 num = node_ip_coverage(ctdb, i, all_ips);
1370 /* was this the first node we checked ? */
1371 if (pnn == -1) {
1372 pnn = i;
1373 min = num;
1374 } else {
1375 if (num < min) {
1376 pnn = i;
1377 min = num;
1381 if (pnn == -1) {
1382 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
1383 ctdb_addr_to_str(&ip->addr)));
1385 return -1;
1388 ip->pnn = pnn;
1389 return 0;
1392 #define IP_KEYLEN 4
1393 static uint32_t *ip_key(ctdb_sock_addr *ip)
1395 static uint32_t key[IP_KEYLEN];
1397 bzero(key, sizeof(key));
1399 switch (ip->sa.sa_family) {
1400 case AF_INET:
1401 key[3] = htonl(ip->ip.sin_addr.s_addr);
1402 break;
1403 case AF_INET6: {
1404 uint32_t *s6_a32 = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
1405 key[0] = htonl(s6_a32[0]);
1406 key[1] = htonl(s6_a32[1]);
1407 key[2] = htonl(s6_a32[2]);
1408 key[3] = htonl(s6_a32[3]);
1409 break;
1411 default:
1412 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
1413 return key;
1416 return key;
1419 static void *add_ip_callback(void *parm, void *data)
1421 struct ctdb_public_ip_list *this_ip = parm;
1422 struct ctdb_public_ip_list *prev_ip = data;
1424 if (prev_ip == NULL) {
1425 return parm;
1427 if (this_ip->pnn == -1) {
1428 this_ip->pnn = prev_ip->pnn;
1431 return parm;
1434 static int getips_count_callback(void *param, void *data)
1436 struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
1437 struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
1439 new_ip->next = *ip_list;
1440 *ip_list = new_ip;
1441 return 0;
1444 static struct ctdb_public_ip_list *
1445 create_merged_ip_list(struct ctdb_context *ctdb)
1447 int i, j;
1448 struct ctdb_public_ip_list *ip_list;
1449 struct ctdb_all_public_ips *public_ips;
1451 if (ctdb->ip_tree != NULL) {
1452 talloc_free(ctdb->ip_tree);
1453 ctdb->ip_tree = NULL;
1455 ctdb->ip_tree = trbt_create(ctdb, 0);
1457 for (i=0;i<ctdb->num_nodes;i++) {
1458 public_ips = ctdb->nodes[i]->known_public_ips;
1460 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1461 continue;
1464 /* there were no public ips for this node */
1465 if (public_ips == NULL) {
1466 continue;
1469 for (j=0;j<public_ips->num;j++) {
1470 struct ctdb_public_ip_list *tmp_ip;
1472 tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
1473 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1474 /* Do not use information about IP addresses hosted
1475 * on other nodes, it may not be accurate */
1476 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1477 tmp_ip->pnn = public_ips->ips[j].pnn;
1478 } else {
1479 tmp_ip->pnn = -1;
1481 tmp_ip->addr = public_ips->ips[j].addr;
1482 tmp_ip->next = NULL;
1484 trbt_insertarray32_callback(ctdb->ip_tree,
1485 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1486 add_ip_callback,
1487 tmp_ip);
1491 ip_list = NULL;
1492 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1494 return ip_list;
1498 * This is the length of the longtest common prefix between the IPs.
1499 * It is calculated by XOR-ing the 2 IPs together and counting the
1500 * number of leading zeroes. The implementation means that all
1501 * addresses end up being 128 bits long.
1503 * FIXME? Should we consider IPv4 and IPv6 separately given that the
1504 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
1505 * lots of nodes and IP addresses?
1507 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
1509 uint32_t ip1_k[IP_KEYLEN];
1510 uint32_t *t;
1511 int i;
1512 uint32_t x;
1514 uint32_t distance = 0;
1516 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
1517 t = ip_key(ip2);
1518 for (i=0; i<IP_KEYLEN; i++) {
1519 x = ip1_k[i] ^ t[i];
1520 if (x == 0) {
1521 distance += 32;
1522 } else {
1523 /* Count number of leading zeroes.
1524 * FIXME? This could be optimised...
1526 while ((x & (1 << 31)) == 0) {
1527 x <<= 1;
1528 distance += 1;
1533 return distance;
1536 /* Calculate the IP distance for the given IP relative to IPs on the
1537 given node. The ips argument is generally the all_ips variable
1538 used in the main part of the algorithm.
1540 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
1541 struct ctdb_public_ip_list *ips,
1542 int pnn)
1544 struct ctdb_public_ip_list *t;
1545 uint32_t d;
1547 uint32_t sum = 0;
1549 for (t=ips; t != NULL; t=t->next) {
1550 if (t->pnn != pnn) {
1551 continue;
1554 /* Optimisation: We never calculate the distance
1555 * between an address and itself. This allows us to
1556 * calculate the effect of removing an address from a
1557 * node by simply calculating the distance between
1558 * that address and all of the exitsing addresses.
1559 * Moreover, we assume that we're only ever dealing
1560 * with addresses from all_ips so we can identify an
1561 * address via a pointer rather than doing a more
1562 * expensive address comparison. */
1563 if (&(t->addr) == ip) {
1564 continue;
1567 d = ip_distance(ip, &(t->addr));
1568 sum += d * d; /* Cheaper than pulling in math.h :-) */
1571 return sum;
1574 /* Return the LCP2 imbalance metric for addresses currently assigned
1575 to the given node.
1577 static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
1579 struct ctdb_public_ip_list *t;
1581 uint32_t imbalance = 0;
1583 for (t=all_ips; t!=NULL; t=t->next) {
1584 if (t->pnn != pnn) {
1585 continue;
1587 /* Pass the rest of the IPs rather than the whole
1588 all_ips input list.
1590 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
1593 return imbalance;
1596 /* Allocate any unassigned IPs just by looping through the IPs and
1597 * finding the best node for each.
1599 static void basic_allocate_unassigned(struct ctdb_context *ctdb,
1600 struct ctdb_ipflags *ipflags,
1601 struct ctdb_public_ip_list *all_ips)
1603 struct ctdb_public_ip_list *tmp_ip;
1605 /* loop over all ip's and find a physical node to cover for
1606 each unassigned ip.
1608 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1609 if (tmp_ip->pnn == -1) {
1610 if (find_takeover_node(ctdb, ipflags, tmp_ip, all_ips)) {
1611 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1612 ctdb_addr_to_str(&tmp_ip->addr)));
1618 /* Basic non-deterministic rebalancing algorithm.
1620 static void basic_failback(struct ctdb_context *ctdb,
1621 struct ctdb_ipflags *ipflags,
1622 struct ctdb_public_ip_list *all_ips,
1623 int num_ips)
1625 int i, numnodes;
1626 int maxnode, maxnum, minnode, minnum, num, retries;
1627 struct ctdb_public_ip_list *tmp_ip;
1629 numnodes = talloc_array_length(ipflags);
1630 retries = 0;
1632 try_again:
1633 maxnum=0;
1634 minnum=0;
1636 /* for each ip address, loop over all nodes that can serve
1637 this ip and make sure that the difference between the node
1638 serving the most and the node serving the least ip's are
1639 not greater than 1.
1641 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1642 if (tmp_ip->pnn == -1) {
1643 continue;
1646 /* Get the highest and lowest number of ips's served by any
1647 valid node which can serve this ip.
1649 maxnode = -1;
1650 minnode = -1;
1651 for (i=0; i<numnodes; i++) {
1652 /* only check nodes that can actually serve this ip */
1653 if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
1654 /* no it couldnt so skip to the next node */
1655 continue;
1658 num = node_ip_coverage(ctdb, i, all_ips);
1659 if (maxnode == -1) {
1660 maxnode = i;
1661 maxnum = num;
1662 } else {
1663 if (num > maxnum) {
1664 maxnode = i;
1665 maxnum = num;
1668 if (minnode == -1) {
1669 minnode = i;
1670 minnum = num;
1671 } else {
1672 if (num < minnum) {
1673 minnode = i;
1674 minnum = num;
1678 if (maxnode == -1) {
1679 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1680 ctdb_addr_to_str(&tmp_ip->addr)));
1682 continue;
1685 /* if the spread between the smallest and largest coverage by
1686 a node is >=2 we steal one of the ips from the node with
1687 most coverage to even things out a bit.
1688 try to do this a limited number of times since we dont
1689 want to spend too much time balancing the ip coverage.
1691 if ( (maxnum > minnum+1)
1692 && (retries < (num_ips + 5)) ){
1693 struct ctdb_public_ip_list *tmp;
1695 /* Reassign one of maxnode's VNNs */
1696 for (tmp=all_ips;tmp;tmp=tmp->next) {
1697 if (tmp->pnn == maxnode) {
1698 (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
1699 retries++;
1700 goto try_again;;
1707 struct ctdb_rebalancenodes {
1708 struct ctdb_rebalancenodes *next;
1709 uint32_t pnn;
1711 static struct ctdb_rebalancenodes *force_rebalance_list = NULL;
1714 /* set this flag to force the node to be rebalanced even if it just didnt
1715 become healthy again.
1717 void lcp2_forcerebalance(struct ctdb_context *ctdb, uint32_t pnn)
1719 struct ctdb_rebalancenodes *rebalance;
1721 for (rebalance = force_rebalance_list; rebalance; rebalance = rebalance->next) {
1722 if (rebalance->pnn == pnn) {
1723 return;
1727 rebalance = talloc(ctdb, struct ctdb_rebalancenodes);
1728 rebalance->pnn = pnn;
1729 rebalance->next = force_rebalance_list;
1730 force_rebalance_list = rebalance;
1733 /* Do necessary LCP2 initialisation. Bury it in a function here so
1734 * that we can unit test it.
1736 static void lcp2_init(struct ctdb_context *tmp_ctx,
1737 struct ctdb_ipflags *ipflags,
1738 struct ctdb_public_ip_list *all_ips,
1739 uint32_t **lcp2_imbalances,
1740 bool **rebalance_candidates)
1742 int i, numnodes;
1743 struct ctdb_public_ip_list *tmp_ip;
1745 numnodes = talloc_array_length(ipflags);
1747 *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
1748 CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
1749 *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
1750 CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
1752 for (i=0; i<numnodes; i++) {
1753 (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
1754 /* First step: assume all nodes are candidates */
1755 (*rebalance_candidates)[i] = true;
1758 /* 2nd step: if a node has IPs assigned then it must have been
1759 * healthy before, so we remove it from consideration. This
1760 * is overkill but is all we have because we don't maintain
1761 * state between takeover runs. An alternative would be to
1762 * keep state and invalidate it every time the recovery master
1763 * changes.
1765 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1766 if (tmp_ip->pnn != -1) {
1767 (*rebalance_candidates)[tmp_ip->pnn] = false;
1771 /* 3rd step: if a node is forced to re-balance then
1772 we allow failback onto the node */
1773 while (force_rebalance_list != NULL) {
1774 struct ctdb_rebalancenodes *next = force_rebalance_list->next;
1776 if (force_rebalance_list->pnn <= numnodes) {
1777 (*rebalance_candidates)[force_rebalance_list->pnn] = true;
1780 DEBUG(DEBUG_ERR,("During ipreallocation, forced rebalance of node %d\n", force_rebalance_list->pnn));
1781 talloc_free(force_rebalance_list);
1782 force_rebalance_list = next;
1786 /* Allocate any unassigned addresses using the LCP2 algorithm to find
1787 * the IP/node combination that will cost the least.
1789 static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
1790 struct ctdb_ipflags *ipflags,
1791 struct ctdb_public_ip_list *all_ips,
1792 uint32_t *lcp2_imbalances)
1794 struct ctdb_public_ip_list *tmp_ip;
1795 int dstnode, numnodes;
1797 int minnode;
1798 uint32_t mindsum, dstdsum, dstimbl, minimbl;
1799 struct ctdb_public_ip_list *minip;
1801 bool should_loop = true;
1802 bool have_unassigned = true;
1804 numnodes = talloc_array_length(ipflags);
1806 while (have_unassigned && should_loop) {
1807 should_loop = false;
1809 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1810 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
1812 minnode = -1;
1813 mindsum = 0;
1814 minip = NULL;
1816 /* loop over each unassigned ip. */
1817 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1818 if (tmp_ip->pnn != -1) {
1819 continue;
1822 for (dstnode=0; dstnode<numnodes; dstnode++) {
1823 /* only check nodes that can actually takeover this ip */
1824 if (!can_node_takeover_ip(ctdb, dstnode,
1825 ipflags[dstnode],
1826 tmp_ip)) {
1827 /* no it couldnt so skip to the next node */
1828 continue;
1831 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1832 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1833 DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
1834 ctdb_addr_to_str(&(tmp_ip->addr)),
1835 dstnode,
1836 dstimbl - lcp2_imbalances[dstnode]));
1839 if ((minnode == -1) || (dstdsum < mindsum)) {
1840 minnode = dstnode;
1841 minimbl = dstimbl;
1842 mindsum = dstdsum;
1843 minip = tmp_ip;
1844 should_loop = true;
1849 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1851 /* If we found one then assign it to the given node. */
1852 if (minnode != -1) {
1853 minip->pnn = minnode;
1854 lcp2_imbalances[minnode] = minimbl;
1855 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
1856 ctdb_addr_to_str(&(minip->addr)),
1857 minnode,
1858 mindsum));
1861 /* There might be a better way but at least this is clear. */
1862 have_unassigned = false;
1863 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1864 if (tmp_ip->pnn == -1) {
1865 have_unassigned = true;
1870 /* We know if we have an unassigned addresses so we might as
1871 * well optimise.
1873 if (have_unassigned) {
1874 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1875 if (tmp_ip->pnn == -1) {
1876 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1877 ctdb_addr_to_str(&tmp_ip->addr)));
1883 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
1884 * to move IPs from, determines the best IP/destination node
1885 * combination to move from the source node.
1887 static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
1888 struct ctdb_ipflags *ipflags,
1889 struct ctdb_public_ip_list *all_ips,
1890 int srcnode,
1891 uint32_t candimbl,
1892 uint32_t *lcp2_imbalances,
1893 bool *rebalance_candidates)
1895 int dstnode, mindstnode, numnodes;
1896 uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
1897 uint32_t minsrcimbl, mindstimbl;
1898 struct ctdb_public_ip_list *minip;
1899 struct ctdb_public_ip_list *tmp_ip;
1901 /* Find an IP and destination node that best reduces imbalance. */
1902 minip = NULL;
1903 minsrcimbl = 0;
1904 mindstnode = -1;
1905 mindstimbl = 0;
1907 numnodes = talloc_array_length(ipflags);
1909 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1910 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
1912 for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
1913 /* Only consider addresses on srcnode. */
1914 if (tmp_ip->pnn != srcnode) {
1915 continue;
1918 /* What is this IP address costing the source node? */
1919 srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
1920 srcimbl = candimbl - srcdsum;
1922 /* Consider this IP address would cost each potential
1923 * destination node. Destination nodes are limited to
1924 * those that are newly healthy, since we don't want
1925 * to do gratuitous failover of IPs just to make minor
1926 * balance improvements.
1928 for (dstnode=0; dstnode<numnodes; dstnode++) {
1929 if (!rebalance_candidates[dstnode]) {
1930 continue;
1933 /* only check nodes that can actually takeover this ip */
1934 if (!can_node_takeover_ip(ctdb, dstnode,
1935 ipflags[dstnode], tmp_ip)) {
1936 /* no it couldnt so skip to the next node */
1937 continue;
1940 dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
1941 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
1942 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
1943 srcnode, srcimbl - lcp2_imbalances[srcnode],
1944 ctdb_addr_to_str(&(tmp_ip->addr)),
1945 dstnode, dstimbl - lcp2_imbalances[dstnode]));
1947 if ((dstimbl < candimbl) && (dstdsum < srcdsum) && \
1948 ((mindstnode == -1) || \
1949 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
1951 minip = tmp_ip;
1952 minsrcimbl = srcimbl;
1953 mindstnode = dstnode;
1954 mindstimbl = dstimbl;
1958 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
1960 if (mindstnode != -1) {
1961 /* We found a move that makes things better... */
1962 DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
1963 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
1964 ctdb_addr_to_str(&(minip->addr)),
1965 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
1968 lcp2_imbalances[srcnode] = srcimbl;
1969 lcp2_imbalances[mindstnode] = mindstimbl;
1970 minip->pnn = mindstnode;
1972 return true;
1975 return false;
/* A node together with its LCP2 imbalance, used for sorting. */
struct lcp2_imbalance_pnn {
	uint32_t imbalance;
	int pnn;
};

/* qsort() comparison function ordering struct lcp2_imbalance_pnn by
 * descending imbalance: returns -1 if a has the larger imbalance,
 * 1 if b has, and 0 if they are equal.
 */
static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
{
	const struct lcp2_imbalance_pnn *left = a;
	const struct lcp2_imbalance_pnn *right = b;

	return (left->imbalance < right->imbalance) -
		(left->imbalance > right->imbalance);
}
1998 /* LCP2 algorithm for rebalancing the cluster. This finds the source
1999 * node with the highest LCP2 imbalance, and then determines the best
2000 * IP/destination node combination to move from the source node.
2002 static void lcp2_failback(struct ctdb_context *ctdb,
2003 struct ctdb_ipflags *ipflags,
2004 struct ctdb_public_ip_list *all_ips,
2005 uint32_t *lcp2_imbalances,
2006 bool *rebalance_candidates)
2008 int i, num_rebalance_candidates, numnodes;
2009 struct lcp2_imbalance_pnn * lips;
2010 bool again;
2012 numnodes = talloc_array_length(ipflags);
2014 try_again:
2016 /* It is only worth continuing if we have suitable target
2017 * nodes to transfer IPs to. This check is much cheaper than
2018 * continuing on...
2020 num_rebalance_candidates = 0;
2021 for (i=0; i<numnodes; i++) {
2022 if (rebalance_candidates[i]) {
2023 num_rebalance_candidates++;
2026 if (num_rebalance_candidates == 0) {
2027 return;
2030 /* Put the imbalances and nodes into an array, sort them and
2031 * iterate through candidates. Usually the 1st one will be
2032 * used, so this doesn't cost much...
2034 lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
2035 for (i=0; i<numnodes; i++) {
2036 lips[i].imbalance = lcp2_imbalances[i];
2037 lips[i].pnn = i;
2039 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
2040 lcp2_cmp_imbalance_pnn);
2042 again = false;
2043 for (i=0; i<numnodes; i++) {
2044 /* This means that all nodes had 0 or 1 addresses, so
2045 * can't be imbalanced.
2047 if (lips[i].imbalance == 0) {
2048 break;
2051 if (lcp2_failback_candidate(ctdb,
2052 ipflags,
2053 all_ips,
2054 lips[i].pnn,
2055 lips[i].imbalance,
2056 lcp2_imbalances,
2057 rebalance_candidates)) {
2058 again = true;
2059 break;
2063 talloc_free(lips);
2064 if (again) {
2065 goto try_again;
2069 static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
2070 struct ctdb_ipflags *ipflags,
2071 struct ctdb_public_ip_list *all_ips)
2073 struct ctdb_public_ip_list *tmp_ip;
2075 /* verify that the assigned nodes can serve that public ip
2076 and set it to -1 if not
2078 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2079 if (tmp_ip->pnn == -1) {
2080 continue;
2082 if (!can_node_host_ip(ctdb, tmp_ip->pnn,
2083 ipflags[tmp_ip->pnn], tmp_ip) != 0) {
2084 /* this node can not serve this ip. */
2085 DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
2086 ctdb_addr_to_str(&(tmp_ip->addr)),
2087 tmp_ip->pnn));
2088 tmp_ip->pnn = -1;
2093 static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
2094 struct ctdb_ipflags *ipflags,
2095 struct ctdb_public_ip_list *all_ips)
2097 struct ctdb_public_ip_list *tmp_ip;
2098 int i, numnodes;
2100 numnodes = talloc_array_length(ipflags);
2102 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
2103 /* Allocate IPs to nodes in a modulo fashion so that IPs will
2104 * always be allocated the same way for a specific set of
2105 * available/unavailable nodes.
2108 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
2109 tmp_ip->pnn = i % numnodes;
2112 /* IP failback doesn't make sense with deterministic
2113 * IPs, since the modulo step above implicitly fails
2114 * back IPs to their "home" node.
2116 if (1 == ctdb->tunable.no_ip_failback) {
2117 DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
2120 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2122 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2124 /* No failback here! */
2127 static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
2128 struct ctdb_ipflags *ipflags,
2129 struct ctdb_public_ip_list *all_ips)
2131 /* This should be pushed down into basic_failback. */
2132 struct ctdb_public_ip_list *tmp_ip;
2133 int num_ips = 0;
2134 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2135 num_ips++;
2138 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2140 basic_allocate_unassigned(ctdb, ipflags, all_ips);
2142 /* If we don't want IPs to fail back then don't rebalance IPs. */
2143 if (1 == ctdb->tunable.no_ip_failback) {
2144 return;
2147 /* Now, try to make sure the ip adresses are evenly distributed
2148 across the nodes.
2150 basic_failback(ctdb, ipflags, all_ips, num_ips);
2153 static void ip_alloc_lcp2(struct ctdb_context *ctdb,
2154 struct ctdb_ipflags *ipflags,
2155 struct ctdb_public_ip_list *all_ips)
2157 uint32_t *lcp2_imbalances;
2158 bool *rebalance_candidates;
2160 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2162 unassign_unsuitable_ips(ctdb, ipflags, all_ips);
2164 lcp2_init(tmp_ctx, ipflags, all_ips,
2165 &lcp2_imbalances, &rebalance_candidates);
2167 lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
2169 /* If we don't want IPs to fail back then don't rebalance IPs. */
2170 if (1 == ctdb->tunable.no_ip_failback) {
2171 goto finished;
2174 /* Now, try to make sure the ip adresses are evenly distributed
2175 across the nodes.
2177 lcp2_failback(ctdb, ipflags, all_ips,
2178 lcp2_imbalances, rebalance_candidates);
2180 finished:
2181 talloc_free(tmp_ctx);
2184 static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
2186 int i, num_healthy;
2188 /* Count how many completely healthy nodes we have */
2189 num_healthy = 0;
2190 for (i=0;i<nodemap->num;i++) {
2191 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
2192 num_healthy++;
2196 return num_healthy == 0;
2199 /* The calculation part of the IP allocation algorithm. */
2200 static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
2201 struct ctdb_ipflags *ipflags,
2202 struct ctdb_public_ip_list **all_ips_p)
2204 /* since nodes only know about those public addresses that
2205 can be served by that particular node, no single node has
2206 a full list of all public addresses that exist in the cluster.
2207 Walk over all node structures and create a merged list of
2208 all public addresses that exist in the cluster.
2210 keep the tree of ips around as ctdb->ip_tree
2212 *all_ips_p = create_merged_ip_list(ctdb);
2214 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
2215 ip_alloc_lcp2(ctdb, ipflags, *all_ips_p);
2216 } else if (1 == ctdb->tunable.deterministic_public_ips) {
2217 ip_alloc_deterministic_ips(ctdb, ipflags, *all_ips_p);
2218 } else {
2219 ip_alloc_nondeterministic_ips(ctdb, ipflags, *all_ips_p);
2222 /* at this point ->pnn is the node which will own each IP
2223 or -1 if there is no node that can cover this ip
2226 return;
2229 struct get_tunable_callback_data {
2230 const char *tunable;
2231 uint32_t *out;
2232 bool fatal;
2235 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
2236 int32_t res, TDB_DATA outdata,
2237 void *callback)
2239 struct get_tunable_callback_data *cd =
2240 (struct get_tunable_callback_data *)callback;
2241 int size;
2243 if (res != 0) {
2244 /* Already handled in fail callback */
2245 return;
2248 if (outdata.dsize != sizeof(uint32_t)) {
2249 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
2250 cd->tunable, pnn, (int)sizeof(uint32_t),
2251 (int)outdata.dsize));
2252 cd->fatal = true;
2253 return;
2256 size = talloc_array_length(cd->out);
2257 if (pnn >= size) {
2258 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
2259 cd->tunable, pnn, size));
2260 return;
2264 cd->out[pnn] = *(uint32_t *)outdata.dptr;
2267 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2268 int32_t res, TDB_DATA outdata,
2269 void *callback)
2271 struct get_tunable_callback_data *cd =
2272 (struct get_tunable_callback_data *)callback;
2274 switch (res) {
2275 case -ETIME:
2276 DEBUG(DEBUG_ERR,
2277 ("Timed out getting tunable \"%s\" from node %d\n",
2278 cd->tunable, pnn));
2279 cd->fatal = true;
2280 break;
2281 case -EINVAL:
2282 case -1:
2283 DEBUG(DEBUG_WARNING,
2284 ("Tunable \"%s\" not implemented on node %d\n",
2285 cd->tunable, pnn));
2286 break;
2287 default:
2288 DEBUG(DEBUG_ERR,
2289 ("Unexpected error getting tunable \"%s\" from node %d\n",
2290 cd->tunable, pnn));
2291 cd->fatal = true;
2295 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
2296 TALLOC_CTX *tmp_ctx,
2297 struct ctdb_node_map *nodemap,
2298 const char *tunable,
2299 uint32_t default_value)
2301 TDB_DATA data;
2302 struct ctdb_control_get_tunable *t;
2303 uint32_t *nodes;
2304 uint32_t *tvals;
2305 struct get_tunable_callback_data callback_data;
2306 int i;
2308 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
2309 CTDB_NO_MEMORY_NULL(ctdb, tvals);
2310 for (i=0; i<nodemap->num; i++) {
2311 tvals[i] = default_value;
2314 callback_data.out = tvals;
2315 callback_data.tunable = tunable;
2316 callback_data.fatal = false;
2318 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
2319 data.dptr = talloc_size(tmp_ctx, data.dsize);
2320 t = (struct ctdb_control_get_tunable *)data.dptr;
2321 t->length = strlen(tunable)+1;
2322 memcpy(t->name, tunable, t->length);
2323 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2324 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
2325 nodes, 0, TAKEOVER_TIMEOUT(),
2326 false, data,
2327 get_tunable_callback,
2328 get_tunable_fail_callback,
2329 &callback_data) != 0) {
2330 if (callback_data.fatal) {
2331 talloc_free(tvals);
2332 tvals = NULL;
2335 talloc_free(nodes);
2336 talloc_free(data.dptr);
2338 return tvals;
2341 struct get_runstate_callback_data {
2342 enum ctdb_runstate *out;
2343 bool fatal;
2346 static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
2347 int32_t res, TDB_DATA outdata,
2348 void *callback_data)
2350 struct get_runstate_callback_data *cd =
2351 (struct get_runstate_callback_data *)callback_data;
2352 int size;
2354 if (res != 0) {
2355 /* Already handled in fail callback */
2356 return;
2359 if (outdata.dsize != sizeof(uint32_t)) {
2360 DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
2361 pnn, (int)sizeof(uint32_t),
2362 (int)outdata.dsize));
2363 cd->fatal = true;
2364 return;
2367 size = talloc_array_length(cd->out);
2368 if (pnn >= size) {
2369 DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
2370 pnn, size));
2371 return;
2374 cd->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
2377 static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
2378 int32_t res, TDB_DATA outdata,
2379 void *callback)
2381 struct get_runstate_callback_data *cd =
2382 (struct get_runstate_callback_data *)callback;
2384 switch (res) {
2385 case -ETIME:
2386 DEBUG(DEBUG_ERR,
2387 ("Timed out getting runstate from node %d\n", pnn));
2388 cd->fatal = true;
2389 break;
2390 default:
2391 DEBUG(DEBUG_WARNING,
2392 ("Error getting runstate from node %d - assuming runstates not supported\n",
2393 pnn));
2397 static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
2398 TALLOC_CTX *tmp_ctx,
2399 struct ctdb_node_map *nodemap,
2400 enum ctdb_runstate default_value)
2402 uint32_t *nodes;
2403 enum ctdb_runstate *rs;
2404 struct get_runstate_callback_data callback_data;
2405 int i;
2407 rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
2408 CTDB_NO_MEMORY_NULL(ctdb, rs);
2409 for (i=0; i<nodemap->num; i++) {
2410 rs[i] = default_value;
2413 callback_data.out = rs;
2414 callback_data.fatal = false;
2416 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2417 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
2418 nodes, 0, TAKEOVER_TIMEOUT(),
2419 true, tdb_null,
2420 get_runstate_callback,
2421 get_runstate_fail_callback,
2422 &callback_data) != 0) {
2423 if (callback_data.fatal) {
2424 free(rs);
2425 rs = NULL;
2428 talloc_free(nodes);
2430 return rs;
2433 /* Set internal flags for IP allocation:
2434 * Clear ip flags
2435 * Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
2436 * Set NOIPHOST ip flag for each INACTIVE node
2437 * if all nodes are disabled:
2438 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
2439 * else
2440 * Set NOIPHOST ip flags for disabled nodes
2442 static struct ctdb_ipflags *
2443 set_ipflags_internal(struct ctdb_context *ctdb,
2444 TALLOC_CTX *tmp_ctx,
2445 struct ctdb_node_map *nodemap,
2446 uint32_t *tval_noiptakeover,
2447 uint32_t *tval_noiphostonalldisabled,
2448 enum ctdb_runstate *runstate)
2450 int i;
2451 struct ctdb_ipflags *ipflags;
2453 /* Clear IP flags - implicit due to talloc_zero */
2454 ipflags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
2455 CTDB_NO_MEMORY_NULL(ctdb, ipflags);
2457 for (i=0;i<nodemap->num;i++) {
2458 /* Can not take IPs on node with NoIPTakeover set */
2459 if (tval_noiptakeover[i] != 0) {
2460 ipflags[i].noiptakeover = true;
2463 /* Can not host IPs on node not in RUNNING state */
2464 if (runstate[i] != CTDB_RUNSTATE_RUNNING) {
2465 ipflags[i].noiphost = true;
2466 continue;
2468 /* Can not host IPs on INACTIVE node */
2469 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2470 ipflags[i].noiphost = true;
2474 if (all_nodes_are_disabled(nodemap)) {
2475 /* If all nodes are disabled, can not host IPs on node
2476 * with NoIPHostOnAllDisabled set
2478 for (i=0;i<nodemap->num;i++) {
2479 if (tval_noiphostonalldisabled[i] != 0) {
2480 ipflags[i].noiphost = true;
2483 } else {
2484 /* If some nodes are not disabled, then can not host
2485 * IPs on DISABLED node
2487 for (i=0;i<nodemap->num;i++) {
2488 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
2489 ipflags[i].noiphost = true;
2494 return ipflags;
2497 static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
2498 TALLOC_CTX *tmp_ctx,
2499 struct ctdb_node_map *nodemap)
2501 uint32_t *tval_noiptakeover;
2502 uint32_t *tval_noiphostonalldisabled;
2503 struct ctdb_ipflags *ipflags;
2504 enum ctdb_runstate *runstate;
2507 tval_noiptakeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2508 "NoIPTakeover", 0);
2509 if (tval_noiptakeover == NULL) {
2510 return NULL;
2513 tval_noiphostonalldisabled =
2514 get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
2515 "NoIPHostOnAllDisabled", 0);
2516 if (tval_noiphostonalldisabled == NULL) {
2517 /* Caller frees tmp_ctx */
2518 return NULL;
2521 /* Any nodes where CTDB_CONTROL_GET_RUNSTATE is not supported
2522 * will default to CTDB_RUNSTATE_RUNNING. This ensures
2523 * reasonable behaviour on a mixed cluster during upgrade.
2525 runstate = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
2526 CTDB_RUNSTATE_RUNNING);
2527 if (runstate == NULL) {
2528 /* Caller frees tmp_ctx */
2529 return NULL;
2532 ipflags = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
2533 tval_noiptakeover,
2534 tval_noiphostonalldisabled,
2535 runstate);
2537 talloc_free(tval_noiptakeover);
2538 talloc_free(tval_noiphostonalldisabled);
2539 talloc_free(runstate);
2541 return ipflags;
2545 make any IP alias changes for public addresses that are necessary
2547 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
2548 client_async_callback fail_callback, void *callback_data)
2550 int i;
2551 struct ctdb_public_ip ip;
2552 struct ctdb_public_ipv4 ipv4;
2553 uint32_t *nodes;
2554 struct ctdb_public_ip_list *all_ips, *tmp_ip;
2555 TDB_DATA data;
2556 struct timeval timeout;
2557 struct client_async_data *async_data;
2558 struct ctdb_client_control_state *state;
2559 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2560 uint32_t disable_timeout;
2561 struct ctdb_ipflags *ipflags;
2564 * ip failover is completely disabled, just send out the
2565 * ipreallocated event.
2567 if (ctdb->tunable.disable_ip_failover != 0) {
2568 goto ipreallocated;
2571 ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
2572 if (ipflags == NULL) {
2573 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
2574 talloc_free(tmp_ctx);
2575 return -1;
2578 ZERO_STRUCT(ip);
2580 /* Do the IP reassignment calculations */
2581 ctdb_takeover_run_core(ctdb, ipflags, &all_ips);
2583 /* The IP flags need to be cleared because they should never
2584 * be seen outside the IP allocation code.
2587 /* The recovery daemon does regular sanity checks of the IPs.
2588 * However, sometimes it is overzealous and thinks changes are
2589 * required when they're already underway. This stops the
2590 * checks for a while before we start moving IPs.
2592 disable_timeout = ctdb->tunable.takeover_timeout;
2593 data.dptr = (uint8_t*)&disable_timeout;
2594 data.dsize = sizeof(disable_timeout);
2595 if (ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2596 CTDB_SRVID_DISABLE_IP_CHECK, data) != 0) {
2597 DEBUG(DEBUG_INFO,("Failed to disable ip verification\n"));
2600 /* now tell all nodes to delete any alias that they should not
2601 have. This will be a NOOP on nodes that don't currently
2602 hold the given alias */
2603 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2604 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2606 async_data->fail_callback = fail_callback;
2607 async_data->callback_data = callback_data;
2609 for (i=0;i<nodemap->num;i++) {
2610 /* don't talk to unconnected nodes, but do talk to banned nodes */
2611 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2612 continue;
2615 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2616 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
2617 /* This node should be serving this
2618 vnn so dont tell it to release the ip
2620 continue;
2622 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2623 ipv4.pnn = tmp_ip->pnn;
2624 ipv4.sin = tmp_ip->addr.ip;
2626 timeout = TAKEOVER_TIMEOUT();
2627 data.dsize = sizeof(ipv4);
2628 data.dptr = (uint8_t *)&ipv4;
2629 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2630 0, CTDB_CONTROL_RELEASE_IPv4, 0,
2631 data, async_data,
2632 &timeout, NULL);
2633 } else {
2634 ip.pnn = tmp_ip->pnn;
2635 ip.addr = tmp_ip->addr;
2637 timeout = TAKEOVER_TIMEOUT();
2638 data.dsize = sizeof(ip);
2639 data.dptr = (uint8_t *)&ip;
2640 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
2641 0, CTDB_CONTROL_RELEASE_IP, 0,
2642 data, async_data,
2643 &timeout, NULL);
2646 if (state == NULL) {
2647 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
2648 talloc_free(tmp_ctx);
2649 return -1;
2652 ctdb_client_async_add(async_data, state);
2655 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2656 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
2657 talloc_free(tmp_ctx);
2658 return -1;
2660 talloc_free(async_data);
2663 /* tell all nodes to get their own IPs */
2664 async_data = talloc_zero(tmp_ctx, struct client_async_data);
2665 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
2667 async_data->fail_callback = fail_callback;
2668 async_data->callback_data = callback_data;
2670 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
2671 if (tmp_ip->pnn == -1) {
2672 /* this IP won't be taken over */
2673 continue;
2676 if (tmp_ip->addr.sa.sa_family == AF_INET) {
2677 ipv4.pnn = tmp_ip->pnn;
2678 ipv4.sin = tmp_ip->addr.ip;
2680 timeout = TAKEOVER_TIMEOUT();
2681 data.dsize = sizeof(ipv4);
2682 data.dptr = (uint8_t *)&ipv4;
2683 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2684 0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
2685 data, async_data,
2686 &timeout, NULL);
2687 } else {
2688 ip.pnn = tmp_ip->pnn;
2689 ip.addr = tmp_ip->addr;
2691 timeout = TAKEOVER_TIMEOUT();
2692 data.dsize = sizeof(ip);
2693 data.dptr = (uint8_t *)&ip;
2694 state = ctdb_control_send(ctdb, tmp_ip->pnn,
2695 0, CTDB_CONTROL_TAKEOVER_IP, 0,
2696 data, async_data,
2697 &timeout, NULL);
2699 if (state == NULL) {
2700 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
2701 talloc_free(tmp_ctx);
2702 return -1;
2705 ctdb_client_async_add(async_data, state);
2707 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
2708 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
2709 talloc_free(tmp_ctx);
2710 return -1;
2713 ipreallocated:
2715 * Tell all nodes to run eventscripts to process the
2716 * "ipreallocated" event. This can do a lot of things,
2717 * including restarting services to reconfigure them if public
2718 * IPs have moved. Once upon a time this event only used to
2719 * update natwg.
2721 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2722 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
2723 nodes, 0, TAKEOVER_TIMEOUT(),
2724 false, tdb_null,
2725 NULL, fail_callback,
2726 callback_data) != 0) {
2727 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
2730 talloc_free(tmp_ctx);
2731 return 0;
2736 destroy a ctdb_client_ip structure
2738 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2740 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2741 ctdb_addr_to_str(&ip->addr),
2742 ntohs(ip->addr.ip.sin_port),
2743 ip->client_id));
2745 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2746 return 0;
2750 called by a client to inform us of a TCP connection that it is managing
2751 that should tickled with an ACK when IP takeover is done
2752 we handle both the old ipv4 style of packets as well as the new ipv4/6
2753 pdus.
2755 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2756 TDB_DATA indata)
2758 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
2759 struct ctdb_control_tcp *old_addr = NULL;
2760 struct ctdb_control_tcp_addr new_addr;
2761 struct ctdb_control_tcp_addr *tcp_sock = NULL;
2762 struct ctdb_tcp_list *tcp;
2763 struct ctdb_tcp_connection t;
2764 int ret;
2765 TDB_DATA data;
2766 struct ctdb_client_ip *ip;
2767 struct ctdb_vnn *vnn;
2768 ctdb_sock_addr addr;
2770 switch (indata.dsize) {
2771 case sizeof(struct ctdb_control_tcp):
2772 old_addr = (struct ctdb_control_tcp *)indata.dptr;
2773 ZERO_STRUCT(new_addr);
2774 tcp_sock = &new_addr;
2775 tcp_sock->src.ip = old_addr->src;
2776 tcp_sock->dest.ip = old_addr->dest;
2777 break;
2778 case sizeof(struct ctdb_control_tcp_addr):
2779 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
2780 break;
2781 default:
2782 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
2783 "to ctdb_control_tcp_client. size was %d but "
2784 "only allowed sizes are %lu and %lu\n",
2785 (int)indata.dsize,
2786 (long unsigned)sizeof(struct ctdb_control_tcp),
2787 (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
2788 return -1;
2791 addr = tcp_sock->src;
2792 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2793 addr = tcp_sock->dest;
2794 ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
2796 ZERO_STRUCT(addr);
2797 memcpy(&addr, &tcp_sock->dest, sizeof(addr));
2798 vnn = find_public_ip_vnn(ctdb, &addr);
2799 if (vnn == NULL) {
2800 switch (addr.sa.sa_family) {
2801 case AF_INET:
2802 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2803 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2804 ctdb_addr_to_str(&addr)));
2806 break;
2807 case AF_INET6:
2808 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2809 ctdb_addr_to_str(&addr)));
2810 break;
2811 default:
2812 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2815 return 0;
2818 if (vnn->pnn != ctdb->pnn) {
2819 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2820 ctdb_addr_to_str(&addr),
2821 client_id, client->pid));
2822 /* failing this call will tell smbd to die */
2823 return -1;
2826 ip = talloc(client, struct ctdb_client_ip);
2827 CTDB_NO_MEMORY(ctdb, ip);
2829 ip->ctdb = ctdb;
2830 ip->addr = addr;
2831 ip->client_id = client_id;
2832 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2833 DLIST_ADD(ctdb->client_ip_list, ip);
2835 tcp = talloc(client, struct ctdb_tcp_list);
2836 CTDB_NO_MEMORY(ctdb, tcp);
2838 tcp->connection.src_addr = tcp_sock->src;
2839 tcp->connection.dst_addr = tcp_sock->dest;
2841 DLIST_ADD(client->tcp_list, tcp);
2843 t.src_addr = tcp_sock->src;
2844 t.dst_addr = tcp_sock->dest;
2846 data.dptr = (uint8_t *)&t;
2847 data.dsize = sizeof(t);
2849 switch (addr.sa.sa_family) {
2850 case AF_INET:
2851 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2852 (unsigned)ntohs(tcp_sock->dest.ip.sin_port),
2853 ctdb_addr_to_str(&tcp_sock->src),
2854 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2855 break;
2856 case AF_INET6:
2857 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2858 (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port),
2859 ctdb_addr_to_str(&tcp_sock->src),
2860 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2861 break;
2862 default:
2863 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2867 /* tell all nodes about this tcp connection */
2868 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2869 CTDB_CONTROL_TCP_ADD,
2870 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2871 if (ret != 0) {
2872 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2873 return -1;
2876 return 0;
2880 find a tcp address on a list
2882 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2883 struct ctdb_tcp_connection *tcp)
2885 int i;
2887 if (array == NULL) {
2888 return NULL;
2891 for (i=0;i<array->num;i++) {
2892 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
2893 ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
2894 return &array->connections[i];
2897 return NULL;
2903 called by a daemon to inform us of a TCP connection that one of its
2904 clients managing that should tickled with an ACK when IP takeover is
2905 done
2907 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2909 struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
2910 struct ctdb_tcp_array *tcparray;
2911 struct ctdb_tcp_connection tcp;
2912 struct ctdb_vnn *vnn;
2914 vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
2915 if (vnn == NULL) {
2916 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2917 ctdb_addr_to_str(&p->dst_addr)));
2919 return -1;
2923 tcparray = vnn->tcp_array;
2925 /* If this is the first tickle */
2926 if (tcparray == NULL) {
2927 tcparray = talloc_size(ctdb->nodes,
2928 offsetof(struct ctdb_tcp_array, connections) +
2929 sizeof(struct ctdb_tcp_connection) * 1);
2930 CTDB_NO_MEMORY(ctdb, tcparray);
2931 vnn->tcp_array = tcparray;
2933 tcparray->num = 0;
2934 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
2935 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2937 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2938 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2939 tcparray->num++;
2941 if (tcp_update_needed) {
2942 vnn->tcp_update_needed = true;
2944 return 0;
2948 /* Do we already have this tickle ?*/
2949 tcp.src_addr = p->src_addr;
2950 tcp.dst_addr = p->dst_addr;
2951 if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
2952 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2953 ctdb_addr_to_str(&tcp.dst_addr),
2954 ntohs(tcp.dst_addr.ip.sin_port),
2955 vnn->pnn));
2956 return 0;
2959 /* A new tickle, we must add it to the array */
2960 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2961 struct ctdb_tcp_connection,
2962 tcparray->num+1);
2963 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2965 vnn->tcp_array = tcparray;
2966 tcparray->connections[tcparray->num].src_addr = p->src_addr;
2967 tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
2968 tcparray->num++;
2970 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2971 ctdb_addr_to_str(&tcp.dst_addr),
2972 ntohs(tcp.dst_addr.ip.sin_port),
2973 vnn->pnn));
2975 if (tcp_update_needed) {
2976 vnn->tcp_update_needed = true;
2979 return 0;
2984 called by a daemon to inform us of a TCP connection that one of its
2985 clients managing that should tickled with an ACK when IP takeover is
2986 done
2988 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
2990 struct ctdb_tcp_connection *tcpp;
2991 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
2993 if (vnn == NULL) {
2994 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
2995 ctdb_addr_to_str(&conn->dst_addr)));
2996 return;
2999 /* if the array is empty we cant remove it
3000 and we dont need to do anything
3002 if (vnn->tcp_array == NULL) {
3003 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
3004 ctdb_addr_to_str(&conn->dst_addr),
3005 ntohs(conn->dst_addr.ip.sin_port)));
3006 return;
3010 /* See if we know this connection
3011 if we dont know this connection then we dont need to do anything
3013 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
3014 if (tcpp == NULL) {
3015 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
3016 ctdb_addr_to_str(&conn->dst_addr),
3017 ntohs(conn->dst_addr.ip.sin_port)));
3018 return;
3022 /* We need to remove this entry from the array.
3023 Instead of allocating a new array and copying data to it
3024 we cheat and just copy the last entry in the existing array
3025 to the entry that is to be removed and just shring the
3026 ->num field
3028 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
3029 vnn->tcp_array->num--;
3031 /* If we deleted the last entry we also need to remove the entire array
3033 if (vnn->tcp_array->num == 0) {
3034 talloc_free(vnn->tcp_array);
3035 vnn->tcp_array = NULL;
3038 vnn->tcp_update_needed = true;
3040 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
3041 ctdb_addr_to_str(&conn->src_addr),
3042 ntohs(conn->src_addr.ip.sin_port)));
3047 called by a daemon to inform us of a TCP connection that one of its
3048 clients used are no longer needed in the tickle database
3050 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
3052 struct ctdb_tcp_connection *conn = (struct ctdb_tcp_connection *)indata.dptr;
3054 ctdb_remove_tcp_connection(ctdb, conn);
3056 return 0;
/*
  called when a daemon restarts - the intent is to send all tickles for
  all public addresses we are serving to the new node immediately.
  This is currently a stub that does nothing and returns 0.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
	/* XXX here we should send all tickles we are serving to the new node */
	return 0;
}
3072 called when a client structure goes away - hook to remove
3073 elements from the tcp_list in all daemons
3075 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
3077 while (client->tcp_list) {
3078 struct ctdb_tcp_list *tcp = client->tcp_list;
3079 DLIST_REMOVE(client->tcp_list, tcp);
3080 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
3086 release all IPs on shutdown
3088 void ctdb_release_all_ips(struct ctdb_context *ctdb)
3090 struct ctdb_vnn *vnn;
3092 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3093 if (!ctdb_sys_have_ip(&vnn->public_address)) {
3094 ctdb_vnn_unassign_iface(ctdb, vnn);
3095 continue;
3097 if (!vnn->iface) {
3098 continue;
3100 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
3101 ctdb_vnn_iface_string(vnn),
3102 ctdb_addr_to_str(&vnn->public_address),
3103 vnn->public_netmask_bits);
3104 release_kill_clients(ctdb, &vnn->public_address);
3105 ctdb_vnn_unassign_iface(ctdb, vnn);
3111 get list of public IPs
3113 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
3114 struct ctdb_req_control *c, TDB_DATA *outdata)
3116 int i, num, len;
3117 struct ctdb_all_public_ips *ips;
3118 struct ctdb_vnn *vnn;
3119 bool only_available = false;
3121 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
3122 only_available = true;
3125 /* count how many public ip structures we have */
3126 num = 0;
3127 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3128 num++;
3131 len = offsetof(struct ctdb_all_public_ips, ips) +
3132 num*sizeof(struct ctdb_public_ip);
3133 ips = talloc_zero_size(outdata, len);
3134 CTDB_NO_MEMORY(ctdb, ips);
3136 i = 0;
3137 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3138 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
3139 continue;
3141 ips->ips[i].pnn = vnn->pnn;
3142 ips->ips[i].addr = vnn->public_address;
3143 i++;
3145 ips->num = i;
3146 len = offsetof(struct ctdb_all_public_ips, ips) +
3147 i*sizeof(struct ctdb_public_ip);
3149 outdata->dsize = len;
3150 outdata->dptr = (uint8_t *)ips;
3152 return 0;
3157 get list of public IPs, old ipv4 style. only returns ipv4 addresses
3159 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb,
3160 struct ctdb_req_control *c, TDB_DATA *outdata)
3162 int i, num, len;
3163 struct ctdb_all_public_ipsv4 *ips;
3164 struct ctdb_vnn *vnn;
3166 /* count how many public ip structures we have */
3167 num = 0;
3168 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3169 if (vnn->public_address.sa.sa_family != AF_INET) {
3170 continue;
3172 num++;
3175 len = offsetof(struct ctdb_all_public_ipsv4, ips) +
3176 num*sizeof(struct ctdb_public_ipv4);
3177 ips = talloc_zero_size(outdata, len);
3178 CTDB_NO_MEMORY(ctdb, ips);
3180 outdata->dsize = len;
3181 outdata->dptr = (uint8_t *)ips;
3183 ips->num = num;
3184 i = 0;
3185 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3186 if (vnn->public_address.sa.sa_family != AF_INET) {
3187 continue;
3189 ips->ips[i].pnn = vnn->pnn;
3190 ips->ips[i].sin = vnn->public_address.ip;
3191 i++;
3194 return 0;
3197 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
3198 struct ctdb_req_control *c,
3199 TDB_DATA indata,
3200 TDB_DATA *outdata)
3202 int i, num, len;
3203 ctdb_sock_addr *addr;
3204 struct ctdb_control_public_ip_info *info;
3205 struct ctdb_vnn *vnn;
3207 addr = (ctdb_sock_addr *)indata.dptr;
3209 vnn = find_public_ip_vnn(ctdb, addr);
3210 if (vnn == NULL) {
3211 /* if it is not a public ip it could be our 'single ip' */
3212 if (ctdb->single_ip_vnn) {
3213 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
3214 vnn = ctdb->single_ip_vnn;
3218 if (vnn == NULL) {
3219 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
3220 "'%s'not a public address\n",
3221 ctdb_addr_to_str(addr)));
3222 return -1;
3225 /* count how many public ip structures we have */
3226 num = 0;
3227 for (;vnn->ifaces[num];) {
3228 num++;
3231 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3232 num*sizeof(struct ctdb_control_iface_info);
3233 info = talloc_zero_size(outdata, len);
3234 CTDB_NO_MEMORY(ctdb, info);
3236 info->ip.addr = vnn->public_address;
3237 info->ip.pnn = vnn->pnn;
3238 info->active_idx = 0xFFFFFFFF;
3240 for (i=0; vnn->ifaces[i]; i++) {
3241 struct ctdb_iface *cur;
3243 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
3244 if (cur == NULL) {
3245 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
3246 vnn->ifaces[i]));
3247 return -1;
3249 if (vnn->iface == cur) {
3250 info->active_idx = i;
3252 strcpy(info->ifaces[i].name, cur->name);
3253 info->ifaces[i].link_state = cur->link_up;
3254 info->ifaces[i].references = cur->references;
3256 info->num = i;
3257 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
3258 i*sizeof(struct ctdb_control_iface_info);
3260 outdata->dsize = len;
3261 outdata->dptr = (uint8_t *)info;
3263 return 0;
3266 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
3267 struct ctdb_req_control *c,
3268 TDB_DATA *outdata)
3270 int i, num, len;
3271 struct ctdb_control_get_ifaces *ifaces;
3272 struct ctdb_iface *cur;
3274 /* count how many public ip structures we have */
3275 num = 0;
3276 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3277 num++;
3280 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3281 num*sizeof(struct ctdb_control_iface_info);
3282 ifaces = talloc_zero_size(outdata, len);
3283 CTDB_NO_MEMORY(ctdb, ifaces);
3285 i = 0;
3286 for (cur=ctdb->ifaces;cur;cur=cur->next) {
3287 strcpy(ifaces->ifaces[i].name, cur->name);
3288 ifaces->ifaces[i].link_state = cur->link_up;
3289 ifaces->ifaces[i].references = cur->references;
3290 i++;
3292 ifaces->num = i;
3293 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
3294 i*sizeof(struct ctdb_control_iface_info);
3296 outdata->dsize = len;
3297 outdata->dptr = (uint8_t *)ifaces;
3299 return 0;
3302 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
3303 struct ctdb_req_control *c,
3304 TDB_DATA indata)
3306 struct ctdb_control_iface_info *info;
3307 struct ctdb_iface *iface;
3308 bool link_up = false;
3310 info = (struct ctdb_control_iface_info *)indata.dptr;
3312 if (info->name[CTDB_IFACE_SIZE] != '\0') {
3313 int len = strnlen(info->name, CTDB_IFACE_SIZE);
3314 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
3315 len, len, info->name));
3316 return -1;
3319 switch (info->link_state) {
3320 case 0:
3321 link_up = false;
3322 break;
3323 case 1:
3324 link_up = true;
3325 break;
3326 default:
3327 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
3328 (unsigned int)info->link_state));
3329 return -1;
3332 if (info->references != 0) {
3333 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
3334 (unsigned int)info->references));
3335 return -1;
3338 iface = ctdb_find_iface(ctdb, info->name);
3339 if (iface == NULL) {
3340 return -1;
3343 if (link_up == iface->link_up) {
3344 return 0;
3347 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
3348 ("iface[%s] has changed it's link status %s => %s\n",
3349 iface->name,
3350 iface->link_up?"up":"down",
3351 link_up?"up":"down"));
3353 iface->link_up = link_up;
3354 return 0;
3359 structure containing the listening socket and the list of tcp connections
3360 that the ctdb daemon is to kill
3362 struct ctdb_kill_tcp {
3363 struct ctdb_vnn *vnn;
3364 struct ctdb_context *ctdb;
3365 int capture_fd;
3366 struct fd_event *fde;
3367 trbt_tree_t *connections;
3368 void *private_data;
3372 a tcp connection that is to be killed
3374 struct ctdb_killtcp_con {
3375 ctdb_sock_addr src_addr;
3376 ctdb_sock_addr dst_addr;
3377 int count;
3378 struct ctdb_kill_tcp *killtcp;
3381 /* this function is used to create a key to represent this socketpair
3382 in the killtcp tree.
3383 this key is used to insert and lookup matching socketpairs that are
3384 to be tickled and RST
3386 #define KILLTCP_KEYLEN 10
3387 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
3389 static uint32_t key[KILLTCP_KEYLEN];
3391 bzero(key, sizeof(key));
3393 if (src->sa.sa_family != dst->sa.sa_family) {
3394 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
3395 return key;
3398 switch (src->sa.sa_family) {
3399 case AF_INET:
3400 key[0] = dst->ip.sin_addr.s_addr;
3401 key[1] = src->ip.sin_addr.s_addr;
3402 key[2] = dst->ip.sin_port;
3403 key[3] = src->ip.sin_port;
3404 break;
3405 case AF_INET6: {
3406 uint32_t *dst6_addr32 =
3407 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
3408 uint32_t *src6_addr32 =
3409 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
3410 key[0] = dst6_addr32[3];
3411 key[1] = src6_addr32[3];
3412 key[2] = dst6_addr32[2];
3413 key[3] = src6_addr32[2];
3414 key[4] = dst6_addr32[1];
3415 key[5] = src6_addr32[1];
3416 key[6] = dst6_addr32[0];
3417 key[7] = src6_addr32[0];
3418 key[8] = dst->ip6.sin6_port;
3419 key[9] = src->ip6.sin6_port;
3420 break;
3422 default:
3423 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
3424 return key;
3427 return key;
3431 called when we get a read event on the raw socket
3433 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde,
3434 uint16_t flags, void *private_data)
3436 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3437 struct ctdb_killtcp_con *con;
3438 ctdb_sock_addr src, dst;
3439 uint32_t ack_seq, seq;
3441 if (!(flags & EVENT_FD_READ)) {
3442 return;
3445 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
3446 killtcp->private_data,
3447 &src, &dst,
3448 &ack_seq, &seq) != 0) {
3449 /* probably a non-tcp ACK packet */
3450 return;
3453 /* check if we have this guy in our list of connections
3454 to kill
3456 con = trbt_lookuparray32(killtcp->connections,
3457 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
3458 if (con == NULL) {
3459 /* no this was some other packet we can just ignore */
3460 return;
3463 /* This one has been tickled !
3464 now reset him and remove him from the list.
3466 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
3467 ntohs(con->dst_addr.ip.sin_port),
3468 ctdb_addr_to_str(&con->src_addr),
3469 ntohs(con->src_addr.ip.sin_port)));
3471 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
3472 talloc_free(con);
3476 /* when traversing the list of all tcp connections to send tickle acks to
3477 (so that we can capture the ack coming back and kill the connection
3478 by a RST)
3479 this callback is called for each connection we are currently trying to kill
3481 static int tickle_connection_traverse(void *param, void *data)
3483 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
3485 /* have tried too many times, just give up */
3486 if (con->count >= 5) {
3487 /* can't delete in traverse: reparent to delete_cons */
3488 talloc_steal(param, con);
3489 return 0;
3492 /* othervise, try tickling it again */
3493 con->count++;
3494 ctdb_sys_send_tcp(
3495 (ctdb_sock_addr *)&con->dst_addr,
3496 (ctdb_sock_addr *)&con->src_addr,
3497 0, 0, 0);
3498 return 0;
3503 called every second until all sentenced connections have been reset
3505 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te,
3506 struct timeval t, void *private_data)
3508 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
3509 void *delete_cons = talloc_new(NULL);
3511 /* loop over all connections sending tickle ACKs */
3512 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
3514 /* now we've finished traverse, it's safe to do deletion. */
3515 talloc_free(delete_cons);
3517 /* If there are no more connections to kill we can remove the
3518 entire killtcp structure
3520 if ( (killtcp->connections == NULL) ||
3521 (killtcp->connections->root == NULL) ) {
3522 talloc_free(killtcp);
3523 return;
3526 /* try tickling them again in a seconds time
3528 event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3529 ctdb_tickle_sentenced_connections, killtcp);
3533 destroy the killtcp structure
3535 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
3537 struct ctdb_vnn *tmpvnn;
3539 /* verify that this vnn is still active */
3540 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
3541 if (tmpvnn == killtcp->vnn) {
3542 break;
3546 if (tmpvnn == NULL) {
3547 return 0;
3550 if (killtcp->vnn->killtcp != killtcp) {
3551 return 0;
3554 killtcp->vnn->killtcp = NULL;
3556 return 0;
/* Insert callback for the killtcp tree: nothing fancy, any existing
   connection structure is unconditionally replaced by the new one.

   The old one is deliberately not freed here; per the original note it
   is talloc_stolen by the same tree node and goes away when the new
   data is deleted.
 */
static void *add_killtcp_callback(void *parm, void *data)
{
	void *replacement = parm;

	return replacement;
}
3573 add a tcp socket to the list of connections we want to RST
3575 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
3576 ctdb_sock_addr *s,
3577 ctdb_sock_addr *d)
3579 ctdb_sock_addr src, dst;
3580 struct ctdb_kill_tcp *killtcp;
3581 struct ctdb_killtcp_con *con;
3582 struct ctdb_vnn *vnn;
3584 ctdb_canonicalize_ip(s, &src);
3585 ctdb_canonicalize_ip(d, &dst);
3587 vnn = find_public_ip_vnn(ctdb, &dst);
3588 if (vnn == NULL) {
3589 vnn = find_public_ip_vnn(ctdb, &src);
3591 if (vnn == NULL) {
3592 /* if it is not a public ip it could be our 'single ip' */
3593 if (ctdb->single_ip_vnn) {
3594 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
3595 vnn = ctdb->single_ip_vnn;
3599 if (vnn == NULL) {
3600 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
3601 return -1;
3604 killtcp = vnn->killtcp;
3606 /* If this is the first connection to kill we must allocate
3607 a new structure
3609 if (killtcp == NULL) {
3610 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
3611 CTDB_NO_MEMORY(ctdb, killtcp);
3613 killtcp->vnn = vnn;
3614 killtcp->ctdb = ctdb;
3615 killtcp->capture_fd = -1;
3616 killtcp->connections = trbt_create(killtcp, 0);
3618 vnn->killtcp = killtcp;
3619 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
3624 /* create a structure that describes this connection we want to
3625 RST and store it in killtcp->connections
3627 con = talloc(killtcp, struct ctdb_killtcp_con);
3628 CTDB_NO_MEMORY(ctdb, con);
3629 con->src_addr = src;
3630 con->dst_addr = dst;
3631 con->count = 0;
3632 con->killtcp = killtcp;
3635 trbt_insertarray32_callback(killtcp->connections,
3636 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
3637 add_killtcp_callback, con);
3640 If we dont have a socket to listen on yet we must create it
3642 if (killtcp->capture_fd == -1) {
3643 const char *iface = ctdb_vnn_iface_string(vnn);
3644 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
3645 if (killtcp->capture_fd == -1) {
3646 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
3647 "socket on iface '%s' for killtcp (%s)\n",
3648 iface, strerror(errno)));
3649 goto failed;
3654 if (killtcp->fde == NULL) {
3655 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd,
3656 EVENT_FD_READ,
3657 capture_tcp_handler, killtcp);
3658 tevent_fd_set_auto_close(killtcp->fde);
3660 /* We also need to set up some events to tickle all these connections
3661 until they are all reset
3663 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
3664 ctdb_tickle_sentenced_connections, killtcp);
3667 /* tickle him once now */
3668 ctdb_sys_send_tcp(
3669 &con->dst_addr,
3670 &con->src_addr,
3671 0, 0, 0);
3673 return 0;
3675 failed:
3676 talloc_free(vnn->killtcp);
3677 vnn->killtcp = NULL;
3678 return -1;
3682 kill a TCP connection.
3684 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
3686 struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
3688 return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
3692 called by a daemon to inform us of the entire list of TCP tickles for
3693 a particular public address.
3694 this control should only be sent by the node that is currently serving
3695 that public address.
3697 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
3699 struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
3700 struct ctdb_tcp_array *tcparray;
3701 struct ctdb_vnn *vnn;
3703 /* We must at least have tickles.num or else we cant verify the size
3704 of the received data blob
3706 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3707 tickles.connections)) {
3708 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
3709 return -1;
3712 /* verify that the size of data matches what we expect */
3713 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
3714 tickles.connections)
3715 + sizeof(struct ctdb_tcp_connection)
3716 * list->tickles.num) {
3717 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
3718 return -1;
3721 vnn = find_public_ip_vnn(ctdb, &list->addr);
3722 if (vnn == NULL) {
3723 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3724 ctdb_addr_to_str(&list->addr)));
3726 return 1;
3729 /* remove any old ticklelist we might have */
3730 talloc_free(vnn->tcp_array);
3731 vnn->tcp_array = NULL;
3733 tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
3734 CTDB_NO_MEMORY(ctdb, tcparray);
3736 tcparray->num = list->tickles.num;
3738 tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
3739 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3741 memcpy(tcparray->connections, &list->tickles.connections[0],
3742 sizeof(struct ctdb_tcp_connection)*tcparray->num);
3744 /* We now have a new fresh tickle list array for this vnn */
3745 vnn->tcp_array = talloc_steal(vnn, tcparray);
3747 return 0;
3751 called to return the full list of tickles for the puclic address associated
3752 with the provided vnn
3754 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3756 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3757 struct ctdb_control_tcp_tickle_list *list;
3758 struct ctdb_tcp_array *tcparray;
3759 int num;
3760 struct ctdb_vnn *vnn;
3762 vnn = find_public_ip_vnn(ctdb, addr);
3763 if (vnn == NULL) {
3764 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3765 ctdb_addr_to_str(addr)));
3767 return 1;
3770 tcparray = vnn->tcp_array;
3771 if (tcparray) {
3772 num = tcparray->num;
3773 } else {
3774 num = 0;
3777 outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list,
3778 tickles.connections)
3779 + sizeof(struct ctdb_tcp_connection) * num;
3781 outdata->dptr = talloc_size(outdata, outdata->dsize);
3782 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3783 list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
3785 list->addr = *addr;
3786 list->tickles.num = num;
3787 if (num) {
3788 memcpy(&list->tickles.connections[0], tcparray->connections,
3789 sizeof(struct ctdb_tcp_connection) * num);
3792 return 0;
3797 set the list of all tcp tickles for a public address
3799 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb,
3800 struct timeval timeout, uint32_t destnode,
3801 ctdb_sock_addr *addr,
3802 struct ctdb_tcp_array *tcparray)
3804 int ret, num;
3805 TDB_DATA data;
3806 struct ctdb_control_tcp_tickle_list *list;
3808 if (tcparray) {
3809 num = tcparray->num;
3810 } else {
3811 num = 0;
3814 data.dsize = offsetof(struct ctdb_control_tcp_tickle_list,
3815 tickles.connections) +
3816 sizeof(struct ctdb_tcp_connection) * num;
3817 data.dptr = talloc_size(ctdb, data.dsize);
3818 CTDB_NO_MEMORY(ctdb, data.dptr);
3820 list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
3821 list->addr = *addr;
3822 list->tickles.num = num;
3823 if (tcparray) {
3824 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
3827 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
3828 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3829 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3830 if (ret != 0) {
3831 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3832 return -1;
3835 talloc_free(data.dptr);
3837 return ret;
3842 perform tickle updates if required
3844 static void ctdb_update_tcp_tickles(struct event_context *ev,
3845 struct timed_event *te,
3846 struct timeval t, void *private_data)
3848 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3849 int ret;
3850 struct ctdb_vnn *vnn;
3852 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3853 /* we only send out updates for public addresses that
3854 we have taken over
3856 if (ctdb->pnn != vnn->pnn) {
3857 continue;
3859 /* We only send out the updates if we need to */
3860 if (!vnn->tcp_update_needed) {
3861 continue;
3863 ret = ctdb_ctrl_set_tcp_tickles(ctdb,
3864 TAKEOVER_TIMEOUT(),
3865 CTDB_BROADCAST_CONNECTED,
3866 &vnn->public_address,
3867 vnn->tcp_array);
3868 if (ret != 0) {
3869 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3870 ctdb_addr_to_str(&vnn->public_address)));
3874 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3875 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3876 ctdb_update_tcp_tickles, ctdb);
3881 start periodic update of tcp tickles
3883 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3885 ctdb->tickle_update_context = talloc_new(ctdb);
3887 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
3888 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3889 ctdb_update_tcp_tickles, ctdb);
3895 struct control_gratious_arp {
3896 struct ctdb_context *ctdb;
3897 ctdb_sock_addr addr;
3898 const char *iface;
3899 int count;
3903 send a control_gratuitous arp
3905 static void send_gratious_arp(struct event_context *ev, struct timed_event *te,
3906 struct timeval t, void *private_data)
3908 int ret;
3909 struct control_gratious_arp *arp = talloc_get_type(private_data,
3910 struct control_gratious_arp);
3912 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3913 if (ret != 0) {
3914 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3915 arp->iface, strerror(errno)));
3919 arp->count++;
3920 if (arp->count == CTDB_ARP_REPEAT) {
3921 talloc_free(arp);
3922 return;
3925 event_add_timed(arp->ctdb->ev, arp,
3926 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3927 send_gratious_arp, arp);
3932 send a gratious arp
3934 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3936 struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
3937 struct control_gratious_arp *arp;
3939 /* verify the size of indata */
3940 if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
3941 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
3942 (unsigned)indata.dsize,
3943 (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
3944 return -1;
3946 if (indata.dsize !=
3947 ( offsetof(struct ctdb_control_gratious_arp, iface)
3948 + gratious_arp->len ) ){
3950 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3951 "but should be %u bytes\n",
3952 (unsigned)indata.dsize,
3953 (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
3954 return -1;
3958 arp = talloc(ctdb, struct control_gratious_arp);
3959 CTDB_NO_MEMORY(ctdb, arp);
3961 arp->ctdb = ctdb;
3962 arp->addr = gratious_arp->addr;
3963 arp->iface = talloc_strdup(arp, gratious_arp->iface);
3964 CTDB_NO_MEMORY(ctdb, arp->iface);
3965 arp->count = 0;
3967 event_add_timed(arp->ctdb->ev, arp,
3968 timeval_zero(), send_gratious_arp, arp);
3970 return 0;
3973 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3975 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
3976 int ret;
3978 /* verify the size of indata */
3979 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
3980 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
3981 return -1;
3983 if (indata.dsize !=
3984 ( offsetof(struct ctdb_control_ip_iface, iface)
3985 + pub->len ) ){
3987 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3988 "but should be %u bytes\n",
3989 (unsigned)indata.dsize,
3990 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
3991 return -1;
3994 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
3996 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3998 if (ret != 0) {
3999 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
4000 return -1;
4003 return 0;
/*
  completion callback for the releaseip event started by
  ctdb_control_del_public_address(): all that is left to do is free the
  talloc context passed in as private_data
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status,
				void *private_data)
{
	(void)ctdb;
	(void)status;

	talloc_free(private_data);
}
4015 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
4017 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
4018 struct ctdb_vnn *vnn;
4019 int ret;
4021 /* verify the size of indata */
4022 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
4023 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
4024 return -1;
4026 if (indata.dsize !=
4027 ( offsetof(struct ctdb_control_ip_iface, iface)
4028 + pub->len ) ){
4030 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
4031 "but should be %u bytes\n",
4032 (unsigned)indata.dsize,
4033 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
4034 return -1;
4037 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
4039 /* walk over all public addresses until we find a match */
4040 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
4041 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
4042 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
4044 DLIST_REMOVE(ctdb->vnn, vnn);
4045 talloc_steal(mem_ctx, vnn);
4046 ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
4047 if (vnn->pnn != ctdb->pnn) {
4048 if (vnn->iface != NULL) {
4049 ctdb_vnn_unassign_iface(ctdb, vnn);
4051 talloc_free(mem_ctx);
4052 return 0;
4054 vnn->pnn = -1;
4056 ret = ctdb_event_script_callback(ctdb,
4057 mem_ctx, delete_ip_callback, mem_ctx,
4058 false,
4059 CTDB_EVENT_RELEASE_IP,
4060 "%s %s %u",
4061 ctdb_vnn_iface_string(vnn),
4062 ctdb_addr_to_str(&vnn->public_address),
4063 vnn->public_netmask_bits);
4064 if (vnn->iface != NULL) {
4065 ctdb_vnn_unassign_iface(ctdb, vnn);
4067 if (ret != 0) {
4068 return -1;
4070 return 0;
4074 return -1;
/* carries the pending request from ctdb_control_ipreallocated() to
   ctdb_ipreallocated_callback() */
struct ipreallocated_callback_state {
	/* control to answer once the "ipreallocated" event has finished */
	struct ctdb_req_control *c;
};
4082 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
4083 int status, void *p)
4085 struct ipreallocated_callback_state *state =
4086 talloc_get_type(p, struct ipreallocated_callback_state);
4088 if (status != 0) {
4089 DEBUG(DEBUG_ERR,
4090 (" \"ipreallocated\" event script failed (status %d)\n",
4091 status));
4092 if (status == -ETIME) {
4093 ctdb_ban_self(ctdb);
4097 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
4098 talloc_free(state);
4101 /* A control to run the ipreallocated event */
4102 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
4103 struct ctdb_req_control *c,
4104 bool *async_reply)
4106 int ret;
4107 struct ipreallocated_callback_state *state;
4109 state = talloc(ctdb, struct ipreallocated_callback_state);
4110 CTDB_NO_MEMORY(ctdb, state);
4112 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
4114 ret = ctdb_event_script_callback(ctdb, state,
4115 ctdb_ipreallocated_callback, state,
4116 false, CTDB_EVENT_IPREALLOCATED,
4117 "%s", "");
4119 if (ret != 0) {
4120 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
4121 talloc_free(state);
4122 return -1;
4125 /* tell the control that we will be reply asynchronously */
4126 state->c = talloc_steal(state, c);
4127 *async_reply = true;
4129 return 0;
4133 /* This function is called from the recovery daemon to verify that a remote
4134 node has the expected ip allocation.
4135 This is verified against ctdb->ip_tree
4137 int verify_remote_ip_allocation(struct ctdb_context *ctdb, struct ctdb_all_public_ips *ips)
4139 struct ctdb_public_ip_list *tmp_ip;
4140 int i;
4142 if (ctdb->ip_tree == NULL) {
4143 /* dont know the expected allocation yet, assume remote node
4144 is correct. */
4145 return 0;
4148 if (ips == NULL) {
4149 return 0;
4152 for (i=0; i<ips->num; i++) {
4153 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
4154 if (tmp_ip == NULL) {
4155 DEBUG(DEBUG_ERR,(__location__ " Could not find host for address %s, reassign ips\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4156 return -1;
4159 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
4160 continue;
4163 if (tmp_ip->pnn != ips->ips[i].pnn) {
4164 DEBUG(DEBUG_ERR,("Inconsistent ip allocation. Trigger reallocation. Thinks %s is held by node %u while it is held by node %u\n", ctdb_addr_to_str(&ips->ips[i].addr), ips->ips[i].pnn, tmp_ip->pnn));
4165 return -1;
4169 return 0;
4172 int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
4174 struct ctdb_public_ip_list *tmp_ip;
4176 if (ctdb->ip_tree == NULL) {
4177 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
4178 return -1;
4181 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
4182 if (tmp_ip == NULL) {
4183 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
4184 return -1;
4187 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
4188 tmp_ip->pnn = ip->pnn;
4190 return 0;
/* book-keeping for one in-flight "reload public ips" request */
struct ctdb_reloadips_handle {
	/* daemon context */
	struct ctdb_context *ctdb;
	/* control to answer when the handle is destroyed; NULL once answered */
	struct ctdb_req_control *c;
	/* status sent in the reply; starts at -1, set from the child's byte */
	int status;
	/* pipe to the child: the parent reads fd[0], the child writes fd[1] */
	int fd[2];
	/* pid of the forked child, killed by the destructor */
	pid_t child;
	/* read event on fd[0] */
	struct fd_event *fde;
};
4203 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
4205 if (h == h->ctdb->reload_ips) {
4206 h->ctdb->reload_ips = NULL;
4208 if (h->c != NULL) {
4209 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
4210 h->c = NULL;
4212 ctdb_kill(h->ctdb, h->child, SIGKILL);
4213 return 0;
4216 static void ctdb_reloadips_timeout_event(struct event_context *ev,
4217 struct timed_event *te,
4218 struct timeval t, void *private_data)
4220 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4222 talloc_free(h);
4225 static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde,
4226 uint16_t flags, void *private_data)
4228 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
4230 char res;
4231 int ret;
4233 ret = read(h->fd[0], &res, 1);
4234 if (ret < 1 || res != 0) {
4235 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
4236 res = 1;
4238 h->status = res;
4240 talloc_free(h);
4243 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
4245 TALLOC_CTX *mem_ctx = talloc_new(NULL);
4246 struct ctdb_all_public_ips *ips;
4247 struct ctdb_vnn *vnn;
4248 int i, ret;
4250 /* read the ip allocation from the local node */
4251 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, &ips);
4252 if (ret != 0) {
4253 DEBUG(DEBUG_ERR, ("Unable to get public ips from local node\n"));
4254 talloc_free(mem_ctx);
4255 return -1;
4258 /* re-read the public ips file */
4259 ctdb->vnn = NULL;
4260 if (ctdb_set_public_addresses(ctdb, false) != 0) {
4261 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
4262 talloc_free(mem_ctx);
4263 return -1;
4267 /* check the previous list of ips and scan for ips that have been
4268 dropped.
4270 for (i = 0; i < ips->num; i++) {
4271 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4272 if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4273 break;
4277 /* we need to delete this ip, no longer available on this node */
4278 if (vnn == NULL) {
4279 struct ctdb_control_ip_iface pub;
4281 DEBUG(DEBUG_NOTICE,("RELOADIPS: IP%s is no longer available on this node. Deleting it.\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4282 pub.addr = ips->ips[i].addr;
4283 pub.mask = 0;
4284 pub.len = 0;
4286 ret = ctdb_ctrl_del_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4287 if (ret != 0) {
4288 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to del public ip:%s from local node\n", ctdb_addr_to_str(&ips->ips[i].addr)));
4289 return -1;
4295 /* loop over all new ones and check the ones we need to add */
4296 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
4297 for (i = 0; i < ips->num; i++) {
4298 if (ctdb_same_ip(&vnn->public_address, &ips->ips[i].addr)) {
4299 break;
4302 if (i == ips->num) {
4303 struct ctdb_control_ip_iface pub;
4304 const char *ifaces = NULL;
4305 int iface = 0;
4307 DEBUG(DEBUG_NOTICE,("RELOADIPS: New ip:%s found, adding it.\n", ctdb_addr_to_str(&vnn->public_address)));
4309 pub.addr = vnn->public_address;
4310 pub.mask = vnn->public_netmask_bits;
4313 ifaces = vnn->ifaces[0];
4314 iface = 1;
4315 while (vnn->ifaces[iface] != NULL) {
4316 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces, vnn->ifaces[iface]);
4317 iface++;
4319 pub.len = strlen(ifaces)+1;
4320 memcpy(&pub.iface[0], ifaces, strlen(ifaces)+1);
4322 ret = ctdb_ctrl_add_public_ip(ctdb, TAKEOVER_TIMEOUT(), CTDB_CURRENT_NODE, &pub);
4323 if (ret != 0) {
4324 DEBUG(DEBUG_ERR, ("RELOADIPS: Unable to add public ip:%s to local node\n", ctdb_addr_to_str(&vnn->public_address)));
4325 return -1;
4330 return 0;
4333 /* This control is sent to force the node to re-read the public addresses file
4334 and drop any addresses we should nnot longer host, and add new addresses
4335 that we are now able to host
4337 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
4339 struct ctdb_reloadips_handle *h;
4340 pid_t parent = getpid();
4342 if (ctdb->reload_ips != NULL) {
4343 talloc_free(ctdb->reload_ips);
4344 ctdb->reload_ips = NULL;
4347 h = talloc(ctdb, struct ctdb_reloadips_handle);
4348 CTDB_NO_MEMORY(ctdb, h);
4349 h->ctdb = ctdb;
4350 h->c = NULL;
4351 h->status = -1;
4353 if (pipe(h->fd) == -1) {
4354 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
4355 talloc_free(h);
4356 return -1;
4359 h->child = ctdb_fork(ctdb);
4360 if (h->child == (pid_t)-1) {
4361 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
4362 close(h->fd[0]);
4363 close(h->fd[1]);
4364 talloc_free(h);
4365 return -1;
4368 /* child process */
4369 if (h->child == 0) {
4370 signed char res = 0;
4372 close(h->fd[0]);
4373 debug_extra = talloc_asprintf(NULL, "reloadips:");
4375 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
4376 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
4377 res = -1;
4378 } else {
4379 res = ctdb_reloadips_child(ctdb);
4380 if (res != 0) {
4381 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
4385 write(h->fd[1], &res, 1);
4386 /* make sure we die when our parent dies */
4387 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
4388 sleep(5);
4390 _exit(0);
4393 h->c = talloc_steal(h, c);
4395 close(h->fd[1]);
4396 set_close_on_exec(h->fd[0]);
4398 talloc_set_destructor(h, ctdb_reloadips_destructor);
4401 h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
4402 EVENT_FD_READ, ctdb_reloadips_child_handler,
4403 (void *)h);
4404 tevent_fd_set_auto_close(h->fde);
4406 event_add_timed(ctdb->ev, h,
4407 timeval_current_ofs(120, 0),
4408 ctdb_reloadips_timeout_event, h);
4410 /* we reply later */
4411 *async_reply = true;
4412 return 0;