tdb: Add tdb_trace_1plusn_rec_flag_ret
[Samba.git] / ctdb / server / ctdb_takeover.c
blobff096ce1a3d98cc4e2dada846ae90ba7351931c9
1 /*
2 ctdb ip takeover code
4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "replace.h"
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/time.h"
25 #include "system/wait.h"
27 #include <talloc.h>
28 #include <tevent.h>
30 #include "lib/util/dlinklist.h"
31 #include "lib/util/debug.h"
32 #include "lib/util/samba_util.h"
33 #include "lib/util/util_process.h"
35 #include "ctdb_private.h"
36 #include "ctdb_client.h"
38 #include "common/rb_tree.h"
39 #include "common/reqid.h"
40 #include "common/system.h"
41 #include "common/common.h"
42 #include "common/logging.h"
44 #include "server/ipalloc.h"
/* Timeout for takeover-related controls, taken from the takeover_timeout
 * tunable.  Expands to an expression that needs a "ctdb" in scope. */
#define TAKEOVER_TIMEOUT() \
	timeval_current_ofs(ctdb->tunable.takeover_timeout, 0)

/* Delay between repeated gratuitous ARPs (passed as the first argument
 * of timeval_current_ofs() when the ARP timer is re-armed) */
#define CTDB_ARP_INTERVAL 1
/* The ARP sender stops once its send count reaches this value */
#define CTDB_ARP_REPEAT 3
/* A local network interface that public addresses can be assigned to */
struct ctdb_interface {
	/* Links for the list headed at ctdb->ifaces */
	struct ctdb_interface *prev;
	struct ctdb_interface *next;
	const char *name;
	bool link_up;
	/* Incremented when a VNN is assigned to this interface and
	 * decremented when it is unassigned */
	uint32_t references;
};
58 /* state associated with a public ip address */
59 struct ctdb_vnn {
60 struct ctdb_vnn *prev, *next;
62 struct ctdb_interface *iface;
63 const char **ifaces;
64 ctdb_sock_addr public_address;
65 uint8_t public_netmask_bits;
67 /* the node number that is serving this public address, if any.
68 If no node serves this ip it is set to -1 */
69 int32_t pnn;
71 /* List of clients to tickle for this public address */
72 struct ctdb_tcp_array *tcp_array;
74 /* whether we need to update the other nodes with changes to our list
75 of connected clients */
76 bool tcp_update_needed;
78 /* a context to hang sending gratious arp events off */
79 TALLOC_CTX *takeover_ctx;
81 /* Set to true any time an update to this VNN is in flight.
82 This helps to avoid races. */
83 bool update_in_flight;
85 /* If CTDB_CONTROL_DEL_PUBLIC_IP is received for this IP
86 * address then this flag is set. It will be deleted in the
87 * release IP callback. */
88 bool delete_pending;
91 static const char *iface_string(const struct ctdb_interface *iface)
93 return (iface != NULL ? iface->name : "__none__");
96 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
98 return iface_string(vnn->iface);
101 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
103 struct ctdb_interface *i;
105 if (strlen(iface) > CTDB_IFACE_SIZE) {
106 DEBUG(DEBUG_ERR, ("Interface name too long \"%s\"\n", iface));
107 return -1;
110 /* Verify that we don't have an entry for this ip yet */
111 for (i=ctdb->ifaces;i;i=i->next) {
112 if (strcmp(i->name, iface) == 0) {
113 return 0;
117 /* create a new structure for this interface */
118 i = talloc_zero(ctdb, struct ctdb_interface);
119 CTDB_NO_MEMORY_FATAL(ctdb, i);
120 i->name = talloc_strdup(i, iface);
121 CTDB_NO_MEMORY(ctdb, i->name);
123 i->link_up = true;
125 DLIST_ADD(ctdb->ifaces, i);
127 return 0;
130 static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
131 const char *name)
133 int n;
135 for (n = 0; vnn->ifaces[n] != NULL; n++) {
136 if (strcmp(name, vnn->ifaces[n]) == 0) {
137 return true;
141 return false;
144 /* If any interfaces now have no possible IPs then delete them. This
145 * implementation is naive (i.e. simple) rather than clever
146 * (i.e. complex). Given that this is run on delip and that operation
147 * is rare, this doesn't need to be efficient - it needs to be
148 * foolproof. One alternative is reference counting, where the logic
149 * is distributed and can, therefore, be broken in multiple places.
150 * Another alternative is to build a red-black tree of interfaces that
151 * can have addresses (by walking ctdb->vnn once) and then walking
152 * ctdb->ifaces once and deleting those not in the tree. Let's go to
153 * one of those if the naive implementation causes problems... :-)
155 static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
156 struct ctdb_vnn *vnn)
158 struct ctdb_interface *i, *next;
160 /* For each interface, check if there's an IP using it. */
161 for (i = ctdb->ifaces; i != NULL; i = next) {
162 struct ctdb_vnn *tv;
163 bool found;
164 next = i->next;
166 /* Only consider interfaces named in the given VNN. */
167 if (!vnn_has_interface_with_name(vnn, i->name)) {
168 continue;
171 /* Search for a vnn with this interface. */
172 found = false;
173 for (tv=ctdb->vnn; tv; tv=tv->next) {
174 if (vnn_has_interface_with_name(tv, i->name)) {
175 found = true;
176 break;
180 if (!found) {
181 /* None of the VNNs are using this interface. */
182 DLIST_REMOVE(ctdb->ifaces, i);
183 talloc_free(i);
189 static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
190 const char *iface)
192 struct ctdb_interface *i;
194 for (i=ctdb->ifaces;i;i=i->next) {
195 if (strcmp(i->name, iface) == 0) {
196 return i;
200 return NULL;
203 static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
204 struct ctdb_vnn *vnn)
206 int i;
207 struct ctdb_interface *cur = NULL;
208 struct ctdb_interface *best = NULL;
210 for (i=0; vnn->ifaces[i]; i++) {
212 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
213 if (cur == NULL) {
214 continue;
217 if (!cur->link_up) {
218 continue;
221 if (best == NULL) {
222 best = cur;
223 continue;
226 if (cur->references < best->references) {
227 best = cur;
228 continue;
232 return best;
235 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
236 struct ctdb_vnn *vnn)
238 struct ctdb_interface *best = NULL;
240 if (vnn->iface) {
241 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
242 "still assigned to iface '%s'\n",
243 ctdb_addr_to_str(&vnn->public_address),
244 ctdb_vnn_iface_string(vnn)));
245 return 0;
248 best = ctdb_vnn_best_iface(ctdb, vnn);
249 if (best == NULL) {
250 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
251 "cannot assign to iface any iface\n",
252 ctdb_addr_to_str(&vnn->public_address)));
253 return -1;
256 vnn->iface = best;
257 best->references++;
258 vnn->pnn = ctdb->pnn;
260 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
261 "now assigned to iface '%s' refs[%d]\n",
262 ctdb_addr_to_str(&vnn->public_address),
263 ctdb_vnn_iface_string(vnn),
264 best->references));
265 return 0;
268 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
269 struct ctdb_vnn *vnn)
271 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
272 "now unassigned (old iface '%s' refs[%d])\n",
273 ctdb_addr_to_str(&vnn->public_address),
274 ctdb_vnn_iface_string(vnn),
275 vnn->iface?vnn->iface->references:0));
276 if (vnn->iface) {
277 vnn->iface->references--;
279 vnn->iface = NULL;
280 if (vnn->pnn == ctdb->pnn) {
281 vnn->pnn = -1;
285 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
286 struct ctdb_vnn *vnn)
288 int i;
290 /* Nodes that are not RUNNING can not host IPs */
291 if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
292 return false;
295 if (vnn->delete_pending) {
296 return false;
299 if (vnn->iface && vnn->iface->link_up) {
300 return true;
303 for (i=0; vnn->ifaces[i]; i++) {
304 struct ctdb_interface *cur;
306 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
307 if (cur == NULL) {
308 continue;
311 if (cur->link_up) {
312 return true;
316 return false;
319 struct ctdb_takeover_arp {
320 struct ctdb_context *ctdb;
321 uint32_t count;
322 ctdb_sock_addr addr;
323 struct ctdb_tcp_array *tcparray;
324 struct ctdb_vnn *vnn;
329 lists of tcp endpoints
331 struct ctdb_tcp_list {
332 struct ctdb_tcp_list *prev, *next;
333 struct ctdb_connection connection;
337 list of clients to kill on IP release
339 struct ctdb_client_ip {
340 struct ctdb_client_ip *prev, *next;
341 struct ctdb_context *ctdb;
342 ctdb_sock_addr addr;
343 uint32_t client_id;
348 send a gratuitous arp
350 static void ctdb_control_send_arp(struct tevent_context *ev,
351 struct tevent_timer *te,
352 struct timeval t, void *private_data)
354 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
355 struct ctdb_takeover_arp);
356 int i, ret;
357 struct ctdb_tcp_array *tcparray;
358 const char *iface = ctdb_vnn_iface_string(arp->vnn);
360 ret = ctdb_sys_send_arp(&arp->addr, iface);
361 if (ret != 0) {
362 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
363 iface, strerror(errno)));
366 tcparray = arp->tcparray;
367 if (tcparray) {
368 for (i=0;i<tcparray->num;i++) {
369 struct ctdb_connection *tcon;
371 tcon = &tcparray->connections[i];
372 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
373 (unsigned)ntohs(tcon->dst.ip.sin_port),
374 ctdb_addr_to_str(&tcon->src),
375 (unsigned)ntohs(tcon->src.ip.sin_port)));
376 ret = ctdb_sys_send_tcp(
377 &tcon->src,
378 &tcon->dst,
379 0, 0, 0);
380 if (ret != 0) {
381 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
382 ctdb_addr_to_str(&tcon->src)));
387 arp->count++;
389 if (arp->count == CTDB_ARP_REPEAT) {
390 talloc_free(arp);
391 return;
394 tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
395 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
396 ctdb_control_send_arp, arp);
399 static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
400 struct ctdb_vnn *vnn)
402 struct ctdb_takeover_arp *arp;
403 struct ctdb_tcp_array *tcparray;
405 if (!vnn->takeover_ctx) {
406 vnn->takeover_ctx = talloc_new(vnn);
407 if (!vnn->takeover_ctx) {
408 return -1;
412 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
413 if (!arp) {
414 return -1;
417 arp->ctdb = ctdb;
418 arp->addr = vnn->public_address;
419 arp->vnn = vnn;
421 tcparray = vnn->tcp_array;
422 if (tcparray) {
423 /* add all of the known tcp connections for this IP to the
424 list of tcp connections to send tickle acks for */
425 arp->tcparray = talloc_steal(arp, tcparray);
427 vnn->tcp_array = NULL;
428 vnn->tcp_update_needed = true;
431 tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
432 timeval_zero(), ctdb_control_send_arp, arp);
434 return 0;
/* State carried from ctdb_do_takeip() to its event script callback */
struct ctdb_do_takeip_state {
	/* Control to reply to once the event script has finished */
	struct ctdb_req_control_old *c;
	/* Public address being taken over */
	struct ctdb_vnn *vnn;
};
443 called when takeip event finishes
445 static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
446 void *private_data)
448 struct ctdb_do_takeip_state *state =
449 talloc_get_type(private_data, struct ctdb_do_takeip_state);
450 int32_t ret;
451 TDB_DATA data;
453 if (status != 0) {
454 if (status == -ETIME) {
455 ctdb_ban_self(ctdb);
457 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
458 ctdb_addr_to_str(&state->vnn->public_address),
459 ctdb_vnn_iface_string(state->vnn)));
460 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
462 talloc_free(state);
463 return;
466 if (ctdb->do_checkpublicip) {
468 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
469 if (ret != 0) {
470 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
471 talloc_free(state);
472 return;
477 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
478 data.dsize = strlen((char *)data.dptr) + 1;
479 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
481 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
484 /* the control succeeded */
485 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
486 talloc_free(state);
487 return;
490 static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
492 state->vnn->update_in_flight = false;
493 return 0;
497 take over an ip address
499 static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
500 struct ctdb_req_control_old *c,
501 struct ctdb_vnn *vnn)
503 int ret;
504 struct ctdb_do_takeip_state *state;
506 if (vnn->update_in_flight) {
507 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
508 "update for this IP already in flight\n",
509 ctdb_addr_to_str(&vnn->public_address),
510 vnn->public_netmask_bits));
511 return -1;
514 ret = ctdb_vnn_assign_iface(ctdb, vnn);
515 if (ret != 0) {
516 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
517 "assign a usable interface\n",
518 ctdb_addr_to_str(&vnn->public_address),
519 vnn->public_netmask_bits));
520 return -1;
523 state = talloc(vnn, struct ctdb_do_takeip_state);
524 CTDB_NO_MEMORY(ctdb, state);
526 state->c = talloc_steal(ctdb, c);
527 state->vnn = vnn;
529 vnn->update_in_flight = true;
530 talloc_set_destructor(state, ctdb_takeip_destructor);
532 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
533 ctdb_addr_to_str(&vnn->public_address),
534 vnn->public_netmask_bits,
535 ctdb_vnn_iface_string(vnn)));
537 ret = ctdb_event_script_callback(ctdb,
538 state,
539 ctdb_do_takeip_callback,
540 state,
541 CTDB_EVENT_TAKE_IP,
542 "%s %s %u",
543 ctdb_vnn_iface_string(vnn),
544 ctdb_addr_to_str(&vnn->public_address),
545 vnn->public_netmask_bits);
547 if (ret != 0) {
548 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
549 ctdb_addr_to_str(&vnn->public_address),
550 ctdb_vnn_iface_string(vnn)));
551 talloc_free(state);
552 return -1;
555 return 0;
/* State carried from ctdb_do_updateip() to its event script callback */
struct ctdb_do_updateip_state {
	/* Control to reply to once the event script has finished */
	struct ctdb_req_control_old *c;
	/* Interface the address was on before the update (may be NULL) */
	struct ctdb_interface *old;
	/* Public address being moved */
	struct ctdb_vnn *vnn;
};
565 called when updateip event finishes
567 static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
568 void *private_data)
570 struct ctdb_do_updateip_state *state =
571 talloc_get_type(private_data, struct ctdb_do_updateip_state);
572 int32_t ret;
574 if (status != 0) {
575 if (status == -ETIME) {
576 ctdb_ban_self(ctdb);
578 DEBUG(DEBUG_ERR,
579 ("Failed update of IP %s from interface %s to %s\n",
580 ctdb_addr_to_str(&state->vnn->public_address),
581 iface_string(state->old),
582 ctdb_vnn_iface_string(state->vnn)));
585 * All we can do is reset the old interface
586 * and let the next run fix it
588 ctdb_vnn_unassign_iface(ctdb, state->vnn);
589 state->vnn->iface = state->old;
590 state->vnn->iface->references++;
592 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
593 talloc_free(state);
594 return;
597 if (ctdb->do_checkpublicip) {
599 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
600 if (ret != 0) {
601 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
602 talloc_free(state);
603 return;
608 /* the control succeeded */
609 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
610 talloc_free(state);
611 return;
614 static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
616 state->vnn->update_in_flight = false;
617 return 0;
621 update (move) an ip address
623 static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
624 struct ctdb_req_control_old *c,
625 struct ctdb_vnn *vnn)
627 int ret;
628 struct ctdb_do_updateip_state *state;
629 struct ctdb_interface *old = vnn->iface;
630 const char *old_name = iface_string(old);
631 const char *new_name;
633 if (vnn->update_in_flight) {
634 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
635 "update for this IP already in flight\n",
636 ctdb_addr_to_str(&vnn->public_address),
637 vnn->public_netmask_bits));
638 return -1;
641 ctdb_vnn_unassign_iface(ctdb, vnn);
642 ret = ctdb_vnn_assign_iface(ctdb, vnn);
643 if (ret != 0) {
644 DEBUG(DEBUG_ERR,("Update of IP %s/%u failed to "
645 "assign a usable interface (old iface '%s')\n",
646 ctdb_addr_to_str(&vnn->public_address),
647 vnn->public_netmask_bits,
648 old_name));
649 return -1;
652 new_name = ctdb_vnn_iface_string(vnn);
653 if (old_name != NULL && new_name != NULL &&
654 strcmp(old_name, new_name) == 0) {
655 /* A benign update from one interface onto itself.
656 * no need to run the eventscripts in this case, just return
657 * success.
659 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
660 return 0;
663 state = talloc(vnn, struct ctdb_do_updateip_state);
664 CTDB_NO_MEMORY(ctdb, state);
666 state->c = talloc_steal(ctdb, c);
667 state->old = old;
668 state->vnn = vnn;
670 vnn->update_in_flight = true;
671 talloc_set_destructor(state, ctdb_updateip_destructor);
673 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
674 "interface %s to %s\n",
675 ctdb_addr_to_str(&vnn->public_address),
676 vnn->public_netmask_bits,
677 old_name,
678 new_name));
680 ret = ctdb_event_script_callback(ctdb,
681 state,
682 ctdb_do_updateip_callback,
683 state,
684 CTDB_EVENT_UPDATE_IP,
685 "%s %s %s %u",
686 old_name,
687 new_name,
688 ctdb_addr_to_str(&vnn->public_address),
689 vnn->public_netmask_bits);
690 if (ret != 0) {
691 DEBUG(DEBUG_ERR,
692 ("Failed update IP %s from interface %s to %s\n",
693 ctdb_addr_to_str(&vnn->public_address),
694 old_name, new_name));
695 talloc_free(state);
696 return -1;
699 return 0;
703 Find the vnn of the node that has a public ip address
704 returns -1 if the address is not known as a public address
706 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
708 struct ctdb_vnn *vnn;
710 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
711 if (ctdb_same_ip(&vnn->public_address, addr)) {
712 return vnn;
716 return NULL;
720 take over an ip address
722 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
723 struct ctdb_req_control_old *c,
724 TDB_DATA indata,
725 bool *async_reply)
727 int ret;
728 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
729 struct ctdb_vnn *vnn;
730 bool have_ip = false;
731 bool do_updateip = false;
732 bool do_takeip = false;
733 struct ctdb_interface *best_iface = NULL;
735 if (pip->pnn != ctdb->pnn) {
736 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
737 "with pnn %d, but we're node %d\n",
738 ctdb_addr_to_str(&pip->addr),
739 pip->pnn, ctdb->pnn));
740 return -1;
743 /* update out vnn list */
744 vnn = find_public_ip_vnn(ctdb, &pip->addr);
745 if (vnn == NULL) {
746 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
747 ctdb_addr_to_str(&pip->addr)));
748 return 0;
751 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
752 have_ip = ctdb_sys_have_ip(&pip->addr);
754 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
755 if (best_iface == NULL) {
756 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
757 "a usable interface (old %s, have_ip %d)\n",
758 ctdb_addr_to_str(&vnn->public_address),
759 vnn->public_netmask_bits,
760 ctdb_vnn_iface_string(vnn),
761 have_ip));
762 return -1;
765 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
766 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
767 "and we have it on iface[%s], but it was assigned to node %d"
768 "and we are node %d, banning ourself\n",
769 ctdb_addr_to_str(&vnn->public_address),
770 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
771 ctdb_ban_self(ctdb);
772 return -1;
775 if (vnn->pnn == -1 && have_ip) {
776 /* This will cause connections to be reset and
777 * reestablished. However, this is a very unusual
778 * situation and doing this will completely repair the
779 * inconsistency in the VNN.
781 DEBUG(DEBUG_WARNING,
782 (__location__
783 " Doing updateip for IP %s already on an interface\n",
784 ctdb_addr_to_str(&vnn->public_address)));
785 do_updateip = true;
788 if (vnn->iface) {
789 if (vnn->iface != best_iface) {
790 if (!vnn->iface->link_up) {
791 do_updateip = true;
792 } else if (vnn->iface->references > (best_iface->references + 1)) {
793 /* only move when the rebalance gains something */
794 do_updateip = true;
799 if (!have_ip) {
800 if (do_updateip) {
801 ctdb_vnn_unassign_iface(ctdb, vnn);
802 do_updateip = false;
804 do_takeip = true;
807 if (do_takeip) {
808 ret = ctdb_do_takeip(ctdb, c, vnn);
809 if (ret != 0) {
810 return -1;
812 } else if (do_updateip) {
813 ret = ctdb_do_updateip(ctdb, c, vnn);
814 if (ret != 0) {
815 return -1;
817 } else {
819 * The interface is up and the kernel known the ip
820 * => do nothing
822 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
823 ctdb_addr_to_str(&pip->addr),
824 vnn->public_netmask_bits,
825 ctdb_vnn_iface_string(vnn)));
826 return 0;
829 /* tell ctdb_control.c that we will be replying asynchronously */
830 *async_reply = true;
832 return 0;
835 static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
837 DLIST_REMOVE(ctdb->vnn, vnn);
838 ctdb_vnn_unassign_iface(ctdb, vnn);
839 ctdb_remove_orphaned_ifaces(ctdb, vnn);
840 talloc_free(vnn);
843 static struct ctdb_vnn *release_ip_post(struct ctdb_context *ctdb,
844 struct ctdb_vnn *vnn,
845 ctdb_sock_addr *addr)
847 TDB_DATA data;
849 /* Send a message to all clients of this node telling them
850 * that the cluster has been reconfigured and they should
851 * close any connections on this IP address
853 data.dptr = (uint8_t *)ctdb_addr_to_str(addr);
854 data.dsize = strlen((char *)data.dptr)+1;
855 DEBUG(DEBUG_INFO, ("Sending RELEASE_IP message for %s\n", data.dptr));
856 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
858 ctdb_vnn_unassign_iface(ctdb, vnn);
860 /* Process the IP if it has been marked for deletion */
861 if (vnn->delete_pending) {
862 do_delete_ip(ctdb, vnn);
863 return NULL;
866 return vnn;
869 struct release_ip_callback_state {
870 struct ctdb_req_control_old *c;
871 ctdb_sock_addr *addr;
872 struct ctdb_vnn *vnn;
873 uint32_t target_pnn;
877 called when releaseip event finishes
879 static void release_ip_callback(struct ctdb_context *ctdb, int status,
880 void *private_data)
882 struct release_ip_callback_state *state =
883 talloc_get_type(private_data, struct release_ip_callback_state);
885 if (status == -ETIME) {
886 ctdb_ban_self(ctdb);
889 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
890 if (ctdb_sys_have_ip(state->addr)) {
891 DEBUG(DEBUG_ERR,
892 ("IP %s still hosted during release IP callback, failing\n",
893 ctdb_addr_to_str(state->addr)));
894 ctdb_request_control_reply(ctdb, state->c,
895 NULL, -1, NULL);
896 talloc_free(state);
897 return;
901 state->vnn->pnn = state->target_pnn;
902 state->vnn = release_ip_post(ctdb, state->vnn, state->addr);
904 /* the control succeeded */
905 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
906 talloc_free(state);
909 static int ctdb_releaseip_destructor(struct release_ip_callback_state *state)
911 if (state->vnn != NULL) {
912 state->vnn->update_in_flight = false;
914 return 0;
918 release an ip address
920 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
921 struct ctdb_req_control_old *c,
922 TDB_DATA indata,
923 bool *async_reply)
925 int ret;
926 struct release_ip_callback_state *state;
927 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
928 struct ctdb_vnn *vnn;
929 char *iface;
931 /* update our vnn list */
932 vnn = find_public_ip_vnn(ctdb, &pip->addr);
933 if (vnn == NULL) {
934 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
935 ctdb_addr_to_str(&pip->addr)));
936 return 0;
939 /* stop any previous arps */
940 talloc_free(vnn->takeover_ctx);
941 vnn->takeover_ctx = NULL;
943 /* RELEASE_IP controls are sent to all nodes that should not
944 * be hosting a particular IP. This serves 2 purposes. The
945 * first is to help resolve any inconsistencies. If a node
946 * does unexpectly host an IP then it will be released. The
947 * 2nd is to use a "redundant release" to tell non-takeover
948 * nodes where an IP is moving to. This is how "ctdb ip" can
949 * report the (likely) location of an IP by only asking the
950 * local node. Redundant releases need to update the PNN but
951 * are otherwise ignored.
953 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
954 if (!ctdb_sys_have_ip(&pip->addr)) {
955 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
956 ctdb_addr_to_str(&pip->addr),
957 vnn->public_netmask_bits,
958 ctdb_vnn_iface_string(vnn)));
959 vnn->pnn = pip->pnn;
960 ctdb_vnn_unassign_iface(ctdb, vnn);
961 return 0;
963 } else {
964 if (vnn->iface == NULL) {
965 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
966 ctdb_addr_to_str(&pip->addr),
967 vnn->public_netmask_bits));
968 vnn->pnn = pip->pnn;
969 return 0;
973 /* There is a potential race between take_ip and us because we
974 * update the VNN via a callback that run when the
975 * eventscripts have been run. Avoid the race by allowing one
976 * update to be in flight at a time.
978 if (vnn->update_in_flight) {
979 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
980 "update for this IP already in flight\n",
981 ctdb_addr_to_str(&vnn->public_address),
982 vnn->public_netmask_bits));
983 return -1;
986 iface = strdup(ctdb_vnn_iface_string(vnn));
988 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
989 ctdb_addr_to_str(&pip->addr),
990 vnn->public_netmask_bits,
991 iface,
992 pip->pnn));
994 state = talloc(ctdb, struct release_ip_callback_state);
995 if (state == NULL) {
996 ctdb_set_error(ctdb, "Out of memory at %s:%d",
997 __FILE__, __LINE__);
998 free(iface);
999 return -1;
1002 state->c = talloc_steal(state, c);
1003 state->addr = talloc(state, ctdb_sock_addr);
1004 if (state->addr == NULL) {
1005 ctdb_set_error(ctdb, "Out of memory at %s:%d",
1006 __FILE__, __LINE__);
1007 free(iface);
1008 talloc_free(state);
1009 return -1;
1011 *state->addr = pip->addr;
1012 state->target_pnn = pip->pnn;
1013 state->vnn = vnn;
1015 vnn->update_in_flight = true;
1016 talloc_set_destructor(state, ctdb_releaseip_destructor);
1018 ret = ctdb_event_script_callback(ctdb,
1019 state, release_ip_callback, state,
1020 CTDB_EVENT_RELEASE_IP,
1021 "%s %s %u",
1022 iface,
1023 ctdb_addr_to_str(&pip->addr),
1024 vnn->public_netmask_bits);
1025 free(iface);
1026 if (ret != 0) {
1027 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1028 ctdb_addr_to_str(&pip->addr),
1029 ctdb_vnn_iface_string(vnn)));
1030 talloc_free(state);
1031 return -1;
1034 /* tell the control that we will be reply asynchronously */
1035 *async_reply = true;
1036 return 0;
1039 static int ctdb_add_public_address(struct ctdb_context *ctdb,
1040 ctdb_sock_addr *addr,
1041 unsigned mask, const char *ifaces,
1042 bool check_address)
1044 struct ctdb_vnn *vnn;
1045 uint32_t num = 0;
1046 char *tmp;
1047 const char *iface;
1048 int i;
1049 int ret;
1051 tmp = strdup(ifaces);
1052 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1053 if (!ctdb_sys_check_iface_exists(iface)) {
1054 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1055 free(tmp);
1056 return -1;
1059 free(tmp);
1061 /* Verify that we don't have an entry for this ip yet */
1062 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1063 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1064 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1065 ctdb_addr_to_str(addr)));
1066 return -1;
1070 /* create a new vnn structure for this ip address */
1071 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1072 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1073 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1074 tmp = talloc_strdup(vnn, ifaces);
1075 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1076 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1077 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1078 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1079 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1080 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1081 num++;
1083 talloc_free(tmp);
1084 vnn->ifaces[num] = NULL;
1085 vnn->public_address = *addr;
1086 vnn->public_netmask_bits = mask;
1087 vnn->pnn = -1;
1089 for (i=0; vnn->ifaces[i]; i++) {
1090 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1091 if (ret != 0) {
1092 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1093 "for public_address[%s]\n",
1094 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1095 talloc_free(vnn);
1096 return -1;
1100 DLIST_ADD(ctdb->vnn, vnn);
1102 return 0;
1106 setup the public address lists from a file
1108 int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1110 char **lines;
1111 int nlines;
1112 int i;
1114 lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1115 if (lines == NULL) {
1116 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1117 return -1;
1119 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1120 nlines--;
1123 for (i=0;i<nlines;i++) {
1124 unsigned mask;
1125 ctdb_sock_addr addr;
1126 const char *addrstr;
1127 const char *ifaces;
1128 char *tok, *line;
1130 line = lines[i];
1131 while ((*line == ' ') || (*line == '\t')) {
1132 line++;
1134 if (*line == '#') {
1135 continue;
1137 if (strcmp(line, "") == 0) {
1138 continue;
1140 tok = strtok(line, " \t");
1141 addrstr = tok;
1142 tok = strtok(NULL, " \t");
1143 if (tok == NULL) {
1144 if (NULL == ctdb->default_public_interface) {
1145 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1146 i+1));
1147 talloc_free(lines);
1148 return -1;
1150 ifaces = ctdb->default_public_interface;
1151 } else {
1152 ifaces = tok;
1155 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1156 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1157 talloc_free(lines);
1158 return -1;
1160 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1161 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1162 talloc_free(lines);
1163 return -1;
1168 talloc_free(lines);
1169 return 0;
1172 static struct ctdb_public_ip_list *
1173 ctdb_fetch_remote_public_ips(struct ctdb_context *ctdb,
1174 TALLOC_CTX *mem_ctx,
1175 struct ctdb_node_map_old *nodemap,
1176 uint32_t public_ip_flags)
1178 int j, ret;
1179 struct ctdb_public_ip_list_old *ip_list;
1180 struct ctdb_public_ip_list *public_ips;
1182 public_ips = talloc_zero_array(mem_ctx,
1183 struct ctdb_public_ip_list,
1184 nodemap->num);
1185 if (public_ips == NULL) {
1186 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1187 return NULL;
1190 for (j = 0; j < nodemap->num; j++) {
1191 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1192 continue;
1195 /* Retrieve the list of public IPs from the
1196 * node. Flags says whether it is known or
1197 * available. */
1198 ret = ctdb_ctrl_get_public_ips_flags(
1199 ctdb, TAKEOVER_TIMEOUT(), j, public_ips,
1200 public_ip_flags, &ip_list);
1201 if (ret != 0) {
1202 DEBUG(DEBUG_ERR,
1203 ("Failed to read public IPs from node: %u\n", j));
1204 talloc_free(public_ips);
1205 return NULL;
1207 public_ips[j].num = ip_list->num;
1208 if (ip_list->num == 0) {
1209 talloc_free(ip_list);
1210 continue;
1212 public_ips[j].ip = talloc_zero_array(public_ips,
1213 struct ctdb_public_ip,
1214 ip_list->num);
1215 if (public_ips[j].ip == NULL) {
1216 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1217 talloc_free(public_ips);
1218 return NULL;
1220 memcpy(public_ips[j].ip, &ip_list->ips[0],
1221 sizeof(struct ctdb_public_ip) * ip_list->num);
1222 talloc_free(ip_list);
1225 return public_ips;
/* Shared state for the callbacks that collect one tunable from all nodes */
struct get_tunable_callback_data {
	/* Name of the tunable being fetched (used in log messages) */
	const char *tunable;
	/* Per-node results, indexed by node number */
	uint32_t *out;
	/* Set when a failure must abort the whole operation */
	bool fatal;
};
1234 static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
1235 int32_t res, TDB_DATA outdata,
1236 void *callback)
1238 struct get_tunable_callback_data *cd =
1239 (struct get_tunable_callback_data *)callback;
1240 int size;
1242 if (res != 0) {
1243 /* Already handled in fail callback */
1244 return;
1247 if (outdata.dsize != sizeof(uint32_t)) {
1248 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
1249 cd->tunable, pnn, (int)sizeof(uint32_t),
1250 (int)outdata.dsize));
1251 cd->fatal = true;
1252 return;
1255 size = talloc_array_length(cd->out);
1256 if (pnn >= size) {
1257 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
1258 cd->tunable, pnn, size));
1259 return;
1263 cd->out[pnn] = *(uint32_t *)outdata.dptr;
1266 static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
1267 int32_t res, TDB_DATA outdata,
1268 void *callback)
1270 struct get_tunable_callback_data *cd =
1271 (struct get_tunable_callback_data *)callback;
1273 switch (res) {
1274 case -ETIME:
1275 DEBUG(DEBUG_ERR,
1276 ("Timed out getting tunable \"%s\" from node %d\n",
1277 cd->tunable, pnn));
1278 cd->fatal = true;
1279 break;
1280 case -EINVAL:
1281 case -1:
1282 DEBUG(DEBUG_WARNING,
1283 ("Tunable \"%s\" not implemented on node %d\n",
1284 cd->tunable, pnn));
1285 break;
1286 default:
1287 DEBUG(DEBUG_ERR,
1288 ("Unexpected error getting tunable \"%s\" from node %d\n",
1289 cd->tunable, pnn));
1290 cd->fatal = true;
1294 static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
1295 TALLOC_CTX *tmp_ctx,
1296 struct ctdb_node_map_old *nodemap,
1297 const char *tunable,
1298 uint32_t default_value)
1300 TDB_DATA data;
1301 struct ctdb_control_get_tunable *t;
1302 uint32_t *nodes;
1303 uint32_t *tvals;
1304 struct get_tunable_callback_data callback_data;
1305 int i;
1307 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1308 CTDB_NO_MEMORY_NULL(ctdb, tvals);
1309 for (i=0; i<nodemap->num; i++) {
1310 tvals[i] = default_value;
1313 callback_data.out = tvals;
1314 callback_data.tunable = tunable;
1315 callback_data.fatal = false;
1317 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
1318 data.dptr = talloc_size(tmp_ctx, data.dsize);
1319 t = (struct ctdb_control_get_tunable *)data.dptr;
1320 t->length = strlen(tunable)+1;
1321 memcpy(t->name, tunable, t->length);
1322 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1323 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
1324 nodes, 0, TAKEOVER_TIMEOUT(),
1325 false, data,
1326 get_tunable_callback,
1327 get_tunable_fail_callback,
1328 &callback_data) != 0) {
1329 if (callback_data.fatal) {
1330 talloc_free(tvals);
1331 tvals = NULL;
1334 talloc_free(nodes);
1335 talloc_free(data.dptr);
1337 return tvals;
1340 static struct ctdb_node_map *
1341 ctdb_node_map_old_to_new(TALLOC_CTX *mem_ctx,
1342 const struct ctdb_node_map_old *old)
1344 struct ctdb_node_map *new;
1346 new = talloc(mem_ctx, struct ctdb_node_map);
1347 if (new == NULL) {
1348 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1349 return NULL;
1351 new->num = old->num;
1352 new->node = talloc_zero_array(new,
1353 struct ctdb_node_and_flags, new->num);
1354 memcpy(new->node, &old->nodes[0],
1355 sizeof(struct ctdb_node_and_flags) * new->num);
1357 return new;
1361 static bool set_ipflags(struct ctdb_context *ctdb,
1362 struct ipalloc_state *ipalloc_state,
1363 struct ctdb_node_map_old *nodemap)
1365 uint32_t *tval_noiptakeover;
1366 uint32_t *tval_noiphostonalldisabled;
1367 struct ctdb_node_map *new;
1369 tval_noiptakeover = get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
1370 "NoIPTakeover", 0);
1371 if (tval_noiptakeover == NULL) {
1372 return false;
1375 tval_noiphostonalldisabled =
1376 get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
1377 "NoIPHostOnAllDisabled", 0);
1378 if (tval_noiphostonalldisabled == NULL) {
1379 /* Caller frees tmp_ctx */
1380 return false;
1383 new = ctdb_node_map_old_to_new(ipalloc_state, nodemap);
1384 if (new == NULL) {
1385 return false;
1388 ipalloc_set_node_flags(ipalloc_state, new,
1389 tval_noiptakeover,
1390 tval_noiphostonalldisabled);
1392 talloc_free(tval_noiptakeover);
1393 talloc_free(tval_noiphostonalldisabled);
1394 talloc_free(new);
1396 return true;
1399 static enum ipalloc_algorithm
1400 determine_algorithm(const struct ctdb_tunable_list *tunables)
1402 if (1 == tunables->lcp2_public_ip_assignment) {
1403 return IPALLOC_LCP2;
1404 } else if (1 == tunables->deterministic_public_ips) {
1405 return IPALLOC_DETERMINISTIC;
1406 } else {
1407 return IPALLOC_NONDETERMINISTIC;
/* Per-node failure accounting for the controls sent in a takeover run */
struct takeover_callback_data {
	/* Number of entries in fail_count */
	uint32_t num_nodes;
	/* Count of failed controls for each node, indexed by PNN */
	unsigned int *fail_count;
};
1416 static struct takeover_callback_data *
1417 takeover_callback_data_init(TALLOC_CTX *mem_ctx,
1418 uint32_t num_nodes)
1420 static struct takeover_callback_data *takeover_data;
1422 takeover_data = talloc_zero(mem_ctx, struct takeover_callback_data);
1423 if (takeover_data == NULL) {
1424 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1425 return NULL;
1428 takeover_data->fail_count = talloc_zero_array(takeover_data,
1429 unsigned int, num_nodes);
1430 if (takeover_data->fail_count == NULL) {
1431 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
1432 talloc_free(takeover_data);
1433 return NULL;
1436 takeover_data->num_nodes = num_nodes;
1438 return takeover_data;
1441 static void takeover_run_fail_callback(struct ctdb_context *ctdb,
1442 uint32_t node_pnn, int32_t res,
1443 TDB_DATA outdata, void *callback_data)
1445 struct takeover_callback_data *cd =
1446 talloc_get_type_abort(callback_data,
1447 struct takeover_callback_data);
1449 if (node_pnn >= cd->num_nodes) {
1450 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
1451 return;
1454 if (cd->fail_count[node_pnn] == 0) {
1455 DEBUG(DEBUG_ERR,
1456 ("Node %u failed the takeover run\n", node_pnn));
1459 cd->fail_count[node_pnn]++;
1462 static void takeover_run_process_failures(struct ctdb_context *ctdb,
1463 struct takeover_callback_data *tcd)
1465 unsigned int max_fails = 0;
1466 uint32_t max_pnn = -1;
1467 uint32_t i;
1469 for (i = 0; i < tcd->num_nodes; i++) {
1470 if (tcd->fail_count[i] > max_fails) {
1471 max_pnn = i;
1472 max_fails = tcd->fail_count[i];
1476 if (max_fails > 0) {
1477 int ret;
1478 TDB_DATA data;
1480 DEBUG(DEBUG_ERR,
1481 ("Sending banning credits to %u with fail count %u\n",
1482 max_pnn, max_fails));
1484 data.dptr = (uint8_t *)&max_pnn;
1485 data.dsize = sizeof(uint32_t);
1486 ret = ctdb_client_send_message(ctdb,
1487 CTDB_BROADCAST_CONNECTED,
1488 CTDB_SRVID_BANNING,
1489 data);
1490 if (ret != 0) {
1491 DEBUG(DEBUG_ERR,
1492 ("Failed to set banning credits for node %u\n",
1493 max_pnn));
1499 * Recalculate the allocation of public IPs to nodes and have the
1500 * nodes host their allocated addresses.
1502 * - Initialise IP allocation state. Pass:
1503 + algorithm to be used;
1504 + whether IP rebalancing ("failback") should be done (this uses a
1505 cluster-wide configuration variable and only the value form the
1506 master node is used); and
1507 * + list of nodes to force rebalance (internal structure, currently
1508 * no way to fetch, only used by LCP2 for nodes that have had new
1509 * IP addresses added).
1510 * - Set IP flags for IP allocation based on node map and tunables
1511 * NoIPTakeover/NoIPHostOnAllDisabled from all connected nodes
1512 * (tunable fetching done separately so values can be faked in unit
1513 * testing)
1514 * - Retrieve known and available IP addresses (done separately so
1515 * values can be faked in unit testing)
1516 * - Use ipalloc_set_public_ips() to set known and available IP
1517 addresses for allocation
1518 * - If cluster can't host IP addresses then early exit
1519 * - Run IP allocation algorithm
1520 * - Send RELEASE_IP to all nodes for IPs they should not host
1521 * - Send TAKE_IP to all nodes for IPs they should host
1522 * - Send IPREALLOCATED to all nodes (with backward compatibility hack)
1524 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
1525 uint32_t *force_rebalance_nodes)
1527 int i, ret;
1528 struct ctdb_public_ip ip;
1529 uint32_t *nodes;
1530 struct public_ip_list *all_ips, *tmp_ip;
1531 TDB_DATA data;
1532 struct timeval timeout;
1533 struct client_async_data *async_data;
1534 struct ctdb_client_control_state *state;
1535 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1536 struct ipalloc_state *ipalloc_state;
1537 struct ctdb_public_ip_list *known_ips, *available_ips;
1538 struct takeover_callback_data *takeover_data;
1540 /* Initialise fail callback data to be used with
1541 * takeover_run_fail_callback(). A failure in any of the
1542 * following steps will cause an early return, so this can be
1543 * reused for each of those steps without re-initialising. */
1544 takeover_data = takeover_callback_data_init(tmp_ctx,
1545 nodemap->num);
1546 if (takeover_data == NULL) {
1547 talloc_free(tmp_ctx);
1548 return -1;
1551 /* Default timeout for early jump to IPREALLOCATED. See below
1552 * for explanation of 3 times... */
1553 timeout = timeval_current_ofs(3 * ctdb->tunable.takeover_timeout, 0);
1556 * ip failover is completely disabled, just send out the
1557 * ipreallocated event.
1559 if (ctdb->tunable.disable_ip_failover != 0) {
1560 goto ipreallocated;
1563 ipalloc_state = ipalloc_state_init(tmp_ctx, ctdb->num_nodes,
1564 determine_algorithm(&ctdb->tunable),
1565 (ctdb->tunable.no_ip_failback != 0),
1566 force_rebalance_nodes);
1567 if (ipalloc_state == NULL) {
1568 talloc_free(tmp_ctx);
1569 return -1;
1572 if (!set_ipflags(ctdb, ipalloc_state, nodemap)) {
1573 DEBUG(DEBUG_ERR,
1574 ("Failed to set IP flags - aborting takeover run\n"));
1575 talloc_free(tmp_ctx);
1576 return -1;
1579 /* Fetch known/available public IPs from each active node */
1580 /* Fetch lists of known public IPs from all nodes */
1581 known_ips = ctdb_fetch_remote_public_ips(ctdb, ipalloc_state,
1582 nodemap, 0);
1583 if (known_ips == NULL) {
1584 DEBUG(DEBUG_ERR, ("Failed to read known public IPs\n"));
1585 talloc_free(tmp_ctx);
1586 return -1;
1588 available_ips = ctdb_fetch_remote_public_ips(
1589 ctdb, ipalloc_state, nodemap,
1590 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE);
1591 if (available_ips == NULL) {
1592 DEBUG(DEBUG_ERR, ("Failed to read available public IPs\n"));
1593 talloc_free(tmp_ctx);
1594 return -1;
1597 if (! ipalloc_set_public_ips(ipalloc_state, known_ips, available_ips)) {
1598 DEBUG(DEBUG_ERR, ("Failed to set public IPs\n"));
1599 talloc_free(tmp_ctx);
1600 return -1;
1603 if (! ipalloc_can_host_ips(ipalloc_state)) {
1604 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
1605 goto ipreallocated;
1608 /* Do the IP reassignment calculations */
1609 all_ips = ipalloc(ipalloc_state);
1610 if (all_ips == NULL) {
1611 talloc_free(tmp_ctx);
1612 return -1;
1615 /* Now tell all nodes to release any public IPs should not
1616 * host. This will be a NOOP on nodes that don't currently
1617 * hold the given IP.
1619 async_data = talloc_zero(tmp_ctx, struct client_async_data);
1620 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1622 async_data->fail_callback = takeover_run_fail_callback;
1623 async_data->callback_data = takeover_data;
1625 ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
1627 /* Each of the following stages (RELEASE_IP, TAKEOVER_IP,
1628 * IPREALLOCATED) notionally has a timeout of TakeoverTimeout
1629 * seconds. However, RELEASE_IP can take longer due to TCP
1630 * connection killing, so sometimes needs more time.
1631 * Therefore, use a cumulative timeout of TakeoverTimeout * 3
1632 * seconds across all 3 stages. No explicit expiry checks are
1633 * needed before each stage because tevent is smart enough to
1634 * fire the timeouts even if they are in the past. Initialise
1635 * this here so it explicitly covers the stages we're
1636 * interested in but, in particular, not the time taken by the
1637 * ipalloc().
1639 timeout = timeval_current_ofs(3 * ctdb->tunable.takeover_timeout, 0);
1641 /* Send a RELEASE_IP to all nodes that should not be hosting
1642 * each IP. For each IP, all but one of these will be
1643 * redundant. However, the redundant ones are used to tell
1644 * nodes which node should be hosting the IP so that commands
1645 * like "ctdb ip" can display a particular nodes idea of who
1646 * is hosting what. */
1647 for (i=0;i<nodemap->num;i++) {
1648 /* don't talk to unconnected nodes, but do talk to banned nodes */
1649 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1650 continue;
1653 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1654 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1655 /* This node should be serving this
1656 vnn so don't tell it to release the ip
1658 continue;
1660 ip.pnn = tmp_ip->pnn;
1661 ip.addr = tmp_ip->addr;
1663 data.dsize = sizeof(ip);
1664 data.dptr = (uint8_t *)&ip;
1665 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1666 0, CTDB_CONTROL_RELEASE_IP, 0,
1667 data, async_data,
1668 &timeout, NULL);
1669 if (state == NULL) {
1670 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1671 talloc_free(tmp_ctx);
1672 return -1;
1675 ctdb_client_async_add(async_data, state);
1678 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1679 DEBUG(DEBUG_ERR,
1680 ("Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1681 goto fail;
1683 talloc_free(async_data);
1686 /* For each IP, send a TAKOVER_IP to the node that should be
1687 * hosting it. Many of these will often be redundant (since
1688 * the allocation won't have changed) but they can be useful
1689 * to recover from inconsistencies. */
1690 async_data = talloc_zero(tmp_ctx, struct client_async_data);
1691 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1693 async_data->fail_callback = takeover_run_fail_callback;
1694 async_data->callback_data = takeover_data;
1696 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1697 if (tmp_ip->pnn == -1) {
1698 /* this IP won't be taken over */
1699 continue;
1702 ip.pnn = tmp_ip->pnn;
1703 ip.addr = tmp_ip->addr;
1705 data.dsize = sizeof(ip);
1706 data.dptr = (uint8_t *)&ip;
1707 state = ctdb_control_send(ctdb, tmp_ip->pnn,
1708 0, CTDB_CONTROL_TAKEOVER_IP, 0,
1709 data, async_data, &timeout, NULL);
1710 if (state == NULL) {
1711 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
1712 talloc_free(tmp_ctx);
1713 return -1;
1716 ctdb_client_async_add(async_data, state);
1718 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1719 DEBUG(DEBUG_ERR,
1720 ("Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
1721 goto fail;
1724 ipreallocated:
1726 * Tell all nodes to run eventscripts to process the
1727 * "ipreallocated" event. This can do a lot of things,
1728 * including restarting services to reconfigure them if public
1729 * IPs have moved. Once upon a time this event only used to
1730 * update natgw.
1732 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1733 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
1734 nodes, 0, timeout,
1735 false, tdb_null,
1736 NULL, takeover_run_fail_callback,
1737 takeover_data);
1738 if (ret != 0) {
1739 DEBUG(DEBUG_ERR,
1740 ("Async CTDB_CONTROL_IPREALLOCATED control failed\n"));
1741 goto fail;
1744 talloc_free(tmp_ctx);
1745 return ret;
1747 fail:
1748 takeover_run_process_failures(ctdb, takeover_data);
1749 talloc_free(tmp_ctx);
1750 return -1;
1755 destroy a ctdb_client_ip structure
1757 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
1759 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
1760 ctdb_addr_to_str(&ip->addr),
1761 ntohs(ip->addr.ip.sin_port),
1762 ip->client_id));
1764 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
1765 return 0;
1769 called by a client to inform us of a TCP connection that it is managing
1770 that should tickled with an ACK when IP takeover is done
1772 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
1773 TDB_DATA indata)
1775 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
1776 struct ctdb_connection *tcp_sock = NULL;
1777 struct ctdb_tcp_list *tcp;
1778 struct ctdb_connection t;
1779 int ret;
1780 TDB_DATA data;
1781 struct ctdb_client_ip *ip;
1782 struct ctdb_vnn *vnn;
1783 ctdb_sock_addr addr;
1785 /* If we don't have public IPs, tickles are useless */
1786 if (ctdb->vnn == NULL) {
1787 return 0;
1790 tcp_sock = (struct ctdb_connection *)indata.dptr;
1792 addr = tcp_sock->src;
1793 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
1794 addr = tcp_sock->dst;
1795 ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
1797 ZERO_STRUCT(addr);
1798 memcpy(&addr, &tcp_sock->dst, sizeof(addr));
1799 vnn = find_public_ip_vnn(ctdb, &addr);
1800 if (vnn == NULL) {
1801 switch (addr.sa.sa_family) {
1802 case AF_INET:
1803 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
1804 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
1805 ctdb_addr_to_str(&addr)));
1807 break;
1808 case AF_INET6:
1809 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
1810 ctdb_addr_to_str(&addr)));
1811 break;
1812 default:
1813 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
1816 return 0;
1819 if (vnn->pnn != ctdb->pnn) {
1820 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
1821 ctdb_addr_to_str(&addr),
1822 client_id, client->pid));
1823 /* failing this call will tell smbd to die */
1824 return -1;
1827 ip = talloc(client, struct ctdb_client_ip);
1828 CTDB_NO_MEMORY(ctdb, ip);
1830 ip->ctdb = ctdb;
1831 ip->addr = addr;
1832 ip->client_id = client_id;
1833 talloc_set_destructor(ip, ctdb_client_ip_destructor);
1834 DLIST_ADD(ctdb->client_ip_list, ip);
1836 tcp = talloc(client, struct ctdb_tcp_list);
1837 CTDB_NO_MEMORY(ctdb, tcp);
1839 tcp->connection.src = tcp_sock->src;
1840 tcp->connection.dst = tcp_sock->dst;
1842 DLIST_ADD(client->tcp_list, tcp);
1844 t.src = tcp_sock->src;
1845 t.dst = tcp_sock->dst;
1847 data.dptr = (uint8_t *)&t;
1848 data.dsize = sizeof(t);
1850 switch (addr.sa.sa_family) {
1851 case AF_INET:
1852 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1853 (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
1854 ctdb_addr_to_str(&tcp_sock->src),
1855 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
1856 break;
1857 case AF_INET6:
1858 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1859 (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
1860 ctdb_addr_to_str(&tcp_sock->src),
1861 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
1862 break;
1863 default:
1864 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
1868 /* tell all nodes about this tcp connection */
1869 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
1870 CTDB_CONTROL_TCP_ADD,
1871 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1872 if (ret != 0) {
1873 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
1874 return -1;
1877 return 0;
1881 find a tcp address on a list
1883 static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
1884 struct ctdb_connection *tcp)
1886 int i;
1888 if (array == NULL) {
1889 return NULL;
1892 for (i=0;i<array->num;i++) {
1893 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
1894 ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
1895 return &array->connections[i];
1898 return NULL;
1904 called by a daemon to inform us of a TCP connection that one of its
1905 clients managing that should tickled with an ACK when IP takeover is
1906 done
1908 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
1910 struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
1911 struct ctdb_tcp_array *tcparray;
1912 struct ctdb_connection tcp;
1913 struct ctdb_vnn *vnn;
1915 /* If we don't have public IPs, tickles are useless */
1916 if (ctdb->vnn == NULL) {
1917 return 0;
1920 vnn = find_public_ip_vnn(ctdb, &p->dst);
1921 if (vnn == NULL) {
1922 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
1923 ctdb_addr_to_str(&p->dst)));
1925 return -1;
1929 tcparray = vnn->tcp_array;
1931 /* If this is the first tickle */
1932 if (tcparray == NULL) {
1933 tcparray = talloc(vnn, struct ctdb_tcp_array);
1934 CTDB_NO_MEMORY(ctdb, tcparray);
1935 vnn->tcp_array = tcparray;
1937 tcparray->num = 0;
1938 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
1939 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1941 tcparray->connections[tcparray->num].src = p->src;
1942 tcparray->connections[tcparray->num].dst = p->dst;
1943 tcparray->num++;
1945 if (tcp_update_needed) {
1946 vnn->tcp_update_needed = true;
1948 return 0;
1952 /* Do we already have this tickle ?*/
1953 tcp.src = p->src;
1954 tcp.dst = p->dst;
1955 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
1956 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
1957 ctdb_addr_to_str(&tcp.dst),
1958 ntohs(tcp.dst.ip.sin_port),
1959 vnn->pnn));
1960 return 0;
1963 /* A new tickle, we must add it to the array */
1964 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
1965 struct ctdb_connection,
1966 tcparray->num+1);
1967 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1969 tcparray->connections[tcparray->num].src = p->src;
1970 tcparray->connections[tcparray->num].dst = p->dst;
1971 tcparray->num++;
1973 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
1974 ctdb_addr_to_str(&tcp.dst),
1975 ntohs(tcp.dst.ip.sin_port),
1976 vnn->pnn));
1978 if (tcp_update_needed) {
1979 vnn->tcp_update_needed = true;
1982 return 0;
1986 static void ctdb_remove_connection(struct ctdb_vnn *vnn, struct ctdb_connection *conn)
1988 struct ctdb_connection *tcpp;
1990 if (vnn == NULL) {
1991 return;
1994 /* if the array is empty we cant remove it
1995 and we don't need to do anything
1997 if (vnn->tcp_array == NULL) {
1998 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
1999 ctdb_addr_to_str(&conn->dst),
2000 ntohs(conn->dst.ip.sin_port)));
2001 return;
2005 /* See if we know this connection
2006 if we don't know this connection then we dont need to do anything
2008 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2009 if (tcpp == NULL) {
2010 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2011 ctdb_addr_to_str(&conn->dst),
2012 ntohs(conn->dst.ip.sin_port)));
2013 return;
2017 /* We need to remove this entry from the array.
2018 Instead of allocating a new array and copying data to it
2019 we cheat and just copy the last entry in the existing array
2020 to the entry that is to be removed and just shring the
2021 ->num field
2023 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2024 vnn->tcp_array->num--;
2026 /* If we deleted the last entry we also need to remove the entire array
2028 if (vnn->tcp_array->num == 0) {
2029 talloc_free(vnn->tcp_array);
2030 vnn->tcp_array = NULL;
2033 vnn->tcp_update_needed = true;
2035 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2036 ctdb_addr_to_str(&conn->src),
2037 ntohs(conn->src.ip.sin_port)));
2042 called by a daemon to inform us of a TCP connection that one of its
2043 clients used are no longer needed in the tickle database
2045 int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2047 struct ctdb_vnn *vnn;
2048 struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
2050 /* If we don't have public IPs, tickles are useless */
2051 if (ctdb->vnn == NULL) {
2052 return 0;
2055 vnn = find_public_ip_vnn(ctdb, &conn->dst);
2056 if (vnn == NULL) {
2057 DEBUG(DEBUG_ERR,
2058 (__location__ " unable to find public address %s\n",
2059 ctdb_addr_to_str(&conn->dst)));
2060 return 0;
2063 ctdb_remove_connection(vnn, conn);
2065 return 0;
2070 Called when another daemon starts - causes all tickles for all
2071 public addresses we are serving to be sent to the new node on the
2072 next check. This actually causes the next scheduled call to
2073 tdb_update_tcp_tickles() to update all nodes. This is simple and
2074 doesn't require careful error handling.
2076 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
2078 struct ctdb_vnn *vnn;
2080 DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
2081 (unsigned long) pnn));
2083 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
2084 vnn->tcp_update_needed = true;
2087 return 0;
2092 called when a client structure goes away - hook to remove
2093 elements from the tcp_list in all daemons
2095 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2097 while (client->tcp_list) {
2098 struct ctdb_vnn *vnn;
2099 struct ctdb_tcp_list *tcp = client->tcp_list;
2100 struct ctdb_connection *conn = &tcp->connection;
2102 DLIST_REMOVE(client->tcp_list, tcp);
2104 vnn = find_public_ip_vnn(client->ctdb,
2105 &conn->dst);
2106 if (vnn == NULL) {
2107 DEBUG(DEBUG_ERR,
2108 (__location__ " unable to find public address %s\n",
2109 ctdb_addr_to_str(&conn->dst)));
2110 continue;
2113 /* If the IP address is hosted on this node then
2114 * remove the connection. */
2115 if (vnn->pnn == client->ctdb->pnn) {
2116 ctdb_remove_connection(vnn, conn);
2119 /* Otherwise this function has been called because the
2120 * server IP address has been released to another node
2121 * and the client has exited. This means that we
2122 * should not delete the connection information. The
2123 * takeover node processes connections too. */
2128 void ctdb_release_all_ips(struct ctdb_context *ctdb)
2130 struct ctdb_vnn *vnn, *next;
2131 int count = 0;
2133 if (ctdb->tunable.disable_ip_failover == 1) {
2134 return;
2137 for (vnn = ctdb->vnn; vnn != NULL; vnn = next) {
2138 /* vnn can be freed below in release_ip_post() */
2139 next = vnn->next;
2141 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2142 ctdb_vnn_unassign_iface(ctdb, vnn);
2143 continue;
2146 /* Don't allow multiple releases at once. Some code,
2147 * particularly ctdb_tickle_sentenced_connections() is
2148 * not re-entrant */
2149 if (vnn->update_in_flight) {
2150 DEBUG(DEBUG_WARNING,
2151 (__location__
2152 " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
2153 ctdb_addr_to_str(&vnn->public_address),
2154 vnn->public_netmask_bits,
2155 ctdb_vnn_iface_string(vnn)));
2156 continue;
2158 vnn->update_in_flight = true;
2160 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
2161 ctdb_addr_to_str(&vnn->public_address),
2162 vnn->public_netmask_bits,
2163 ctdb_vnn_iface_string(vnn)));
2165 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2166 ctdb_vnn_iface_string(vnn),
2167 ctdb_addr_to_str(&vnn->public_address),
2168 vnn->public_netmask_bits);
2169 /* releaseip timeouts are converted to success, so to
2170 * detect failures just check if the IP address is
2171 * still there...
2173 if (ctdb_sys_have_ip(&vnn->public_address)) {
2174 DEBUG(DEBUG_ERR,
2175 (__location__
2176 " IP address %s not released\n",
2177 ctdb_addr_to_str(&vnn->public_address)));
2178 vnn->update_in_flight = false;
2179 continue;
2182 vnn = release_ip_post(ctdb, vnn, &vnn->public_address);
2183 if (vnn != NULL) {
2184 vnn->update_in_flight = false;
2186 count++;
2189 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
2194 get list of public IPs
2196 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
2197 struct ctdb_req_control_old *c, TDB_DATA *outdata)
2199 int i, num, len;
2200 struct ctdb_public_ip_list_old *ips;
2201 struct ctdb_vnn *vnn;
2202 bool only_available = false;
2204 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2205 only_available = true;
2208 /* count how many public ip structures we have */
2209 num = 0;
2210 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2211 num++;
2214 len = offsetof(struct ctdb_public_ip_list_old, ips) +
2215 num*sizeof(struct ctdb_public_ip);
2216 ips = talloc_zero_size(outdata, len);
2217 CTDB_NO_MEMORY(ctdb, ips);
2219 i = 0;
2220 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2221 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2222 continue;
2224 ips->ips[i].pnn = vnn->pnn;
2225 ips->ips[i].addr = vnn->public_address;
2226 i++;
2228 ips->num = i;
2229 len = offsetof(struct ctdb_public_ip_list_old, ips) +
2230 i*sizeof(struct ctdb_public_ip);
2232 outdata->dsize = len;
2233 outdata->dptr = (uint8_t *)ips;
2235 return 0;
2239 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2240 struct ctdb_req_control_old *c,
2241 TDB_DATA indata,
2242 TDB_DATA *outdata)
2244 int i, num, len;
2245 ctdb_sock_addr *addr;
2246 struct ctdb_public_ip_info_old *info;
2247 struct ctdb_vnn *vnn;
2249 addr = (ctdb_sock_addr *)indata.dptr;
2251 vnn = find_public_ip_vnn(ctdb, addr);
2252 if (vnn == NULL) {
2253 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2254 "'%s'not a public address\n",
2255 ctdb_addr_to_str(addr)));
2256 return -1;
2259 /* count how many public ip structures we have */
2260 num = 0;
2261 for (;vnn->ifaces[num];) {
2262 num++;
2265 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
2266 num*sizeof(struct ctdb_iface);
2267 info = talloc_zero_size(outdata, len);
2268 CTDB_NO_MEMORY(ctdb, info);
2270 info->ip.addr = vnn->public_address;
2271 info->ip.pnn = vnn->pnn;
2272 info->active_idx = 0xFFFFFFFF;
2274 for (i=0; vnn->ifaces[i]; i++) {
2275 struct ctdb_interface *cur;
2277 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2278 if (cur == NULL) {
2279 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2280 vnn->ifaces[i]));
2281 return -1;
2283 if (vnn->iface == cur) {
2284 info->active_idx = i;
2286 strncpy(info->ifaces[i].name, cur->name,
2287 sizeof(info->ifaces[i].name));
2288 info->ifaces[i].name[sizeof(info->ifaces[i].name)-1] = '\0';
2289 info->ifaces[i].link_state = cur->link_up;
2290 info->ifaces[i].references = cur->references;
2292 info->num = i;
2293 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
2294 i*sizeof(struct ctdb_iface);
2296 outdata->dsize = len;
2297 outdata->dptr = (uint8_t *)info;
2299 return 0;
2302 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2303 struct ctdb_req_control_old *c,
2304 TDB_DATA *outdata)
2306 int i, num, len;
2307 struct ctdb_iface_list_old *ifaces;
2308 struct ctdb_interface *cur;
2310 /* count how many public ip structures we have */
2311 num = 0;
2312 for (cur=ctdb->ifaces;cur;cur=cur->next) {
2313 num++;
2316 len = offsetof(struct ctdb_iface_list_old, ifaces) +
2317 num*sizeof(struct ctdb_iface);
2318 ifaces = talloc_zero_size(outdata, len);
2319 CTDB_NO_MEMORY(ctdb, ifaces);
2321 i = 0;
2322 for (cur=ctdb->ifaces;cur;cur=cur->next) {
2323 strncpy(ifaces->ifaces[i].name, cur->name,
2324 sizeof(ifaces->ifaces[i].name));
2325 ifaces->ifaces[i].name[sizeof(ifaces->ifaces[i].name)-1] = '\0';
2326 ifaces->ifaces[i].link_state = cur->link_up;
2327 ifaces->ifaces[i].references = cur->references;
2328 i++;
2330 ifaces->num = i;
2331 len = offsetof(struct ctdb_iface_list_old, ifaces) +
2332 i*sizeof(struct ctdb_iface);
2334 outdata->dsize = len;
2335 outdata->dptr = (uint8_t *)ifaces;
2337 return 0;
2340 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2341 struct ctdb_req_control_old *c,
2342 TDB_DATA indata)
2344 struct ctdb_iface *info;
2345 struct ctdb_interface *iface;
2346 bool link_up = false;
2348 info = (struct ctdb_iface *)indata.dptr;
2350 if (info->name[CTDB_IFACE_SIZE] != '\0') {
2351 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2352 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2353 len, len, info->name));
2354 return -1;
2357 switch (info->link_state) {
2358 case 0:
2359 link_up = false;
2360 break;
2361 case 1:
2362 link_up = true;
2363 break;
2364 default:
2365 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2366 (unsigned int)info->link_state));
2367 return -1;
2370 if (info->references != 0) {
2371 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2372 (unsigned int)info->references));
2373 return -1;
2376 iface = ctdb_find_iface(ctdb, info->name);
2377 if (iface == NULL) {
2378 return -1;
2381 if (link_up == iface->link_up) {
2382 return 0;
2385 DEBUG(DEBUG_ERR,
2386 ("iface[%s] has changed it's link status %s => %s\n",
2387 iface->name,
2388 iface->link_up?"up":"down",
2389 link_up?"up":"down"));
2391 iface->link_up = link_up;
2392 return 0;
2397 called by a daemon to inform us of the entire list of TCP tickles for
2398 a particular public address.
2399 this control should only be sent by the node that is currently serving
2400 that public address.
2402 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2404 struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
2405 struct ctdb_tcp_array *tcparray;
2406 struct ctdb_vnn *vnn;
2408 /* We must at least have tickles.num or else we cant verify the size
2409 of the received data blob
2411 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
2412 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
2413 return -1;
2416 /* verify that the size of data matches what we expect */
2417 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
2418 + sizeof(struct ctdb_connection) * list->num) {
2419 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
2420 return -1;
2423 DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
2424 ctdb_addr_to_str(&list->addr)));
2426 vnn = find_public_ip_vnn(ctdb, &list->addr);
2427 if (vnn == NULL) {
2428 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
2429 ctdb_addr_to_str(&list->addr)));
2431 return 1;
2434 if (vnn->pnn == ctdb->pnn) {
2435 DEBUG(DEBUG_INFO,
2436 ("Ignoring redundant set tcp tickle list, this node hosts '%s'\n",
2437 ctdb_addr_to_str(&list->addr)));
2438 return 0;
2441 /* remove any old ticklelist we might have */
2442 talloc_free(vnn->tcp_array);
2443 vnn->tcp_array = NULL;
2445 tcparray = talloc(vnn, struct ctdb_tcp_array);
2446 CTDB_NO_MEMORY(ctdb, tcparray);
2448 tcparray->num = list->num;
2450 tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
2451 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2453 memcpy(tcparray->connections, &list->connections[0],
2454 sizeof(struct ctdb_connection)*tcparray->num);
2456 /* We now have a new fresh tickle list array for this vnn */
2457 vnn->tcp_array = tcparray;
2459 return 0;
2463 called to return the full list of tickles for the puclic address associated
2464 with the provided vnn
2466 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
2468 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
2469 struct ctdb_tickle_list_old *list;
2470 struct ctdb_tcp_array *tcparray;
2471 int num, i;
2472 struct ctdb_vnn *vnn;
2473 unsigned port;
2475 vnn = find_public_ip_vnn(ctdb, addr);
2476 if (vnn == NULL) {
2477 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
2478 ctdb_addr_to_str(addr)));
2480 return 1;
2483 port = ctdb_addr_to_port(addr);
2485 tcparray = vnn->tcp_array;
2486 num = 0;
2487 if (tcparray != NULL) {
2488 if (port == 0) {
2489 /* All connections */
2490 num = tcparray->num;
2491 } else {
2492 /* Count connections for port */
2493 for (i = 0; i < tcparray->num; i++) {
2494 if (port == ctdb_addr_to_port(&tcparray->connections[i].dst)) {
2495 num++;
2501 outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
2502 + sizeof(struct ctdb_connection) * num;
2504 outdata->dptr = talloc_size(outdata, outdata->dsize);
2505 CTDB_NO_MEMORY(ctdb, outdata->dptr);
2506 list = (struct ctdb_tickle_list_old *)outdata->dptr;
2508 list->addr = *addr;
2509 list->num = num;
2511 if (num == 0) {
2512 return 0;
2515 num = 0;
2516 for (i = 0; i < tcparray->num; i++) {
2517 if (port == 0 || \
2518 port == ctdb_addr_to_port(&tcparray->connections[i].dst)) {
2519 list->connections[num] = tcparray->connections[i];
2520 num++;
2524 return 0;
2529 set the list of all tcp tickles for a public address
2531 static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
2532 ctdb_sock_addr *addr,
2533 struct ctdb_tcp_array *tcparray)
2535 int ret, num;
2536 TDB_DATA data;
2537 struct ctdb_tickle_list_old *list;
2539 if (tcparray) {
2540 num = tcparray->num;
2541 } else {
2542 num = 0;
2545 data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
2546 sizeof(struct ctdb_connection) * num;
2547 data.dptr = talloc_size(ctdb, data.dsize);
2548 CTDB_NO_MEMORY(ctdb, data.dptr);
2550 list = (struct ctdb_tickle_list_old *)data.dptr;
2551 list->addr = *addr;
2552 list->num = num;
2553 if (tcparray) {
2554 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
2557 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
2558 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
2559 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2560 if (ret != 0) {
2561 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
2562 return -1;
2565 talloc_free(data.dptr);
2567 return ret;
2572 perform tickle updates if required
2574 static void ctdb_update_tcp_tickles(struct tevent_context *ev,
2575 struct tevent_timer *te,
2576 struct timeval t, void *private_data)
2578 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
2579 int ret;
2580 struct ctdb_vnn *vnn;
2582 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2583 /* we only send out updates for public addresses that
2584 we have taken over
2586 if (ctdb->pnn != vnn->pnn) {
2587 continue;
2589 /* We only send out the updates if we need to */
2590 if (!vnn->tcp_update_needed) {
2591 continue;
2593 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
2594 &vnn->public_address,
2595 vnn->tcp_array);
2596 if (ret != 0) {
2597 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
2598 ctdb_addr_to_str(&vnn->public_address)));
2599 } else {
2600 DEBUG(DEBUG_INFO,
2601 ("Sent tickle update for public address %s\n",
2602 ctdb_addr_to_str(&vnn->public_address)));
2603 vnn->tcp_update_needed = false;
2607 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
2608 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
2609 ctdb_update_tcp_tickles, ctdb);
2613 start periodic update of tcp tickles
2615 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
2617 ctdb->tickle_update_context = talloc_new(ctdb);
2619 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
2620 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
2621 ctdb_update_tcp_tickles, ctdb);
2627 struct control_gratious_arp {
2628 struct ctdb_context *ctdb;
2629 ctdb_sock_addr addr;
2630 const char *iface;
2631 int count;
2635 send a control_gratuitous arp
2637 static void send_gratious_arp(struct tevent_context *ev,
2638 struct tevent_timer *te,
2639 struct timeval t, void *private_data)
2641 int ret;
2642 struct control_gratious_arp *arp = talloc_get_type(private_data,
2643 struct control_gratious_arp);
2645 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
2646 if (ret != 0) {
2647 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
2648 arp->iface, strerror(errno)));
2652 arp->count++;
2653 if (arp->count == CTDB_ARP_REPEAT) {
2654 talloc_free(arp);
2655 return;
2658 tevent_add_timer(arp->ctdb->ev, arp,
2659 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
2660 send_gratious_arp, arp);
2665 send a gratious arp
2667 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
2669 struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
2670 struct control_gratious_arp *arp;
2672 /* verify the size of indata */
2673 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
2674 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
2675 (unsigned)indata.dsize,
2676 (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
2677 return -1;
2679 if (indata.dsize !=
2680 ( offsetof(struct ctdb_addr_info_old, iface)
2681 + gratious_arp->len ) ){
2683 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2684 "but should be %u bytes\n",
2685 (unsigned)indata.dsize,
2686 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
2687 return -1;
2691 arp = talloc(ctdb, struct control_gratious_arp);
2692 CTDB_NO_MEMORY(ctdb, arp);
2694 arp->ctdb = ctdb;
2695 arp->addr = gratious_arp->addr;
2696 arp->iface = talloc_strdup(arp, gratious_arp->iface);
2697 CTDB_NO_MEMORY(ctdb, arp->iface);
2698 arp->count = 0;
2700 tevent_add_timer(arp->ctdb->ev, arp,
2701 timeval_zero(), send_gratious_arp, arp);
2703 return 0;
2706 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2708 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
2709 int ret;
2711 /* verify the size of indata */
2712 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
2713 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
2714 return -1;
2716 if (indata.dsize !=
2717 ( offsetof(struct ctdb_addr_info_old, iface)
2718 + pub->len ) ){
2720 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2721 "but should be %u bytes\n",
2722 (unsigned)indata.dsize,
2723 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
2724 return -1;
2727 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
2729 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
2731 if (ret != 0) {
2732 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
2733 return -1;
2736 return 0;
2739 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2741 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
2742 struct ctdb_vnn *vnn;
2744 /* verify the size of indata */
2745 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
2746 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
2747 return -1;
2749 if (indata.dsize !=
2750 ( offsetof(struct ctdb_addr_info_old, iface)
2751 + pub->len ) ){
2753 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2754 "but should be %u bytes\n",
2755 (unsigned)indata.dsize,
2756 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
2757 return -1;
2760 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
2762 /* walk over all public addresses until we find a match */
2763 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2764 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
2765 if (vnn->pnn == ctdb->pnn) {
2766 /* This IP is currently being hosted.
2767 * Defer the deletion until the next
2768 * takeover run. "ctdb reloadips" will
2769 * always cause a takeover run. "ctdb
2770 * delip" will now need an explicit
2771 * "ctdb ipreallocated" afterwards. */
2772 vnn->delete_pending = true;
2773 } else {
2774 /* This IP is not hosted on the
2775 * current node so just delete it
2776 * now. */
2777 do_delete_ip(ctdb, vnn);
2780 return 0;
2784 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
2785 ctdb_addr_to_str(&pub->addr)));
2786 return -1;
/* State carried from the "ipreallocated" control to its event callback */
struct ipreallocated_callback_state {
	/* the control request to answer once the event has finished */
	struct ctdb_req_control_old *c;
};
2794 static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
2795 int status, void *p)
2797 struct ipreallocated_callback_state *state =
2798 talloc_get_type(p, struct ipreallocated_callback_state);
2800 if (status != 0) {
2801 DEBUG(DEBUG_ERR,
2802 (" \"ipreallocated\" event script failed (status %d)\n",
2803 status));
2804 if (status == -ETIME) {
2805 ctdb_ban_self(ctdb);
2809 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
2810 talloc_free(state);
2813 /* A control to run the ipreallocated event */
2814 int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
2815 struct ctdb_req_control_old *c,
2816 bool *async_reply)
2818 int ret;
2819 struct ipreallocated_callback_state *state;
2821 state = talloc(ctdb, struct ipreallocated_callback_state);
2822 CTDB_NO_MEMORY(ctdb, state);
2824 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
2826 ret = ctdb_event_script_callback(ctdb, state,
2827 ctdb_ipreallocated_callback, state,
2828 CTDB_EVENT_IPREALLOCATED,
2829 "%s", "");
2831 if (ret != 0) {
2832 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
2833 talloc_free(state);
2834 return -1;
2837 /* tell the control that we will be reply asynchronously */
2838 state->c = talloc_steal(state, c);
2839 *async_reply = true;
2841 return 0;
/* State of one in-flight "reload public ips" request */
struct ctdb_reloadips_handle {
	/* daemon context */
	struct ctdb_context *ctdb;
	/* control request answered when the handle is destroyed */
	struct ctdb_req_control_old *c;
	/* status sent in that reply */
	int status;
	/* pipe from the child: [0] is read by the daemon, [1] written by the child */
	int fd[2];
	/* pid of the child doing the reload */
	pid_t child;
	/* fd event watching the read end of the pipe */
	struct tevent_fd *fde;
};
2854 static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
2856 if (h == h->ctdb->reload_ips) {
2857 h->ctdb->reload_ips = NULL;
2859 if (h->c != NULL) {
2860 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
2861 h->c = NULL;
2863 ctdb_kill(h->ctdb, h->child, SIGKILL);
2864 return 0;
2867 static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
2868 struct tevent_timer *te,
2869 struct timeval t, void *private_data)
2871 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
2873 talloc_free(h);
2876 static void ctdb_reloadips_child_handler(struct tevent_context *ev,
2877 struct tevent_fd *fde,
2878 uint16_t flags, void *private_data)
2880 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
2882 char res;
2883 int ret;
2885 ret = sys_read(h->fd[0], &res, 1);
2886 if (ret < 1 || res != 0) {
2887 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
2888 res = 1;
2890 h->status = res;
2892 talloc_free(h);
2895 static int ctdb_reloadips_child(struct ctdb_context *ctdb)
2897 TALLOC_CTX *mem_ctx = talloc_new(NULL);
2898 struct ctdb_public_ip_list_old *ips;
2899 struct ctdb_vnn *vnn;
2900 struct client_async_data *async_data;
2901 struct timeval timeout;
2902 TDB_DATA data;
2903 struct ctdb_client_control_state *state;
2904 bool first_add;
2905 int i, ret;
2907 CTDB_NO_MEMORY(ctdb, mem_ctx);
2909 /* Read IPs from local node */
2910 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
2911 CTDB_CURRENT_NODE, mem_ctx, &ips);
2912 if (ret != 0) {
2913 DEBUG(DEBUG_ERR,
2914 ("Unable to fetch public IPs from local node\n"));
2915 talloc_free(mem_ctx);
2916 return -1;
2919 /* Read IPs file - this is safe since this is a child process */
2920 ctdb->vnn = NULL;
2921 if (ctdb_set_public_addresses(ctdb, false) != 0) {
2922 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
2923 talloc_free(mem_ctx);
2924 return -1;
2927 async_data = talloc_zero(mem_ctx, struct client_async_data);
2928 CTDB_NO_MEMORY(ctdb, async_data);
2930 /* Compare IPs between node and file for IPs to be deleted */
2931 for (i = 0; i < ips->num; i++) {
2932 /* */
2933 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
2934 if (ctdb_same_ip(&vnn->public_address,
2935 &ips->ips[i].addr)) {
2936 /* IP is still in file */
2937 break;
2941 if (vnn == NULL) {
2942 /* Delete IP ips->ips[i] */
2943 struct ctdb_addr_info_old *pub;
2945 DEBUG(DEBUG_NOTICE,
2946 ("IP %s no longer configured, deleting it\n",
2947 ctdb_addr_to_str(&ips->ips[i].addr)));
2949 pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
2950 CTDB_NO_MEMORY(ctdb, pub);
2952 pub->addr = ips->ips[i].addr;
2953 pub->mask = 0;
2954 pub->len = 0;
2956 timeout = TAKEOVER_TIMEOUT();
2958 data.dsize = offsetof(struct ctdb_addr_info_old,
2959 iface) + pub->len;
2960 data.dptr = (uint8_t *)pub;
2962 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
2963 CTDB_CONTROL_DEL_PUBLIC_IP,
2964 0, data, async_data,
2965 &timeout, NULL);
2966 if (state == NULL) {
2967 DEBUG(DEBUG_ERR,
2968 (__location__
2969 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
2970 goto failed;
2973 ctdb_client_async_add(async_data, state);
2977 /* Compare IPs between node and file for IPs to be added */
2978 first_add = true;
2979 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
2980 for (i = 0; i < ips->num; i++) {
2981 if (ctdb_same_ip(&vnn->public_address,
2982 &ips->ips[i].addr)) {
2983 /* IP already on node */
2984 break;
2987 if (i == ips->num) {
2988 /* Add IP ips->ips[i] */
2989 struct ctdb_addr_info_old *pub;
2990 const char *ifaces = NULL;
2991 uint32_t len;
2992 int iface = 0;
2994 DEBUG(DEBUG_NOTICE,
2995 ("New IP %s configured, adding it\n",
2996 ctdb_addr_to_str(&vnn->public_address)));
2997 if (first_add) {
2998 uint32_t pnn = ctdb_get_pnn(ctdb);
3000 data.dsize = sizeof(pnn);
3001 data.dptr = (uint8_t *)&pnn;
3003 ret = ctdb_client_send_message(
3004 ctdb,
3005 CTDB_BROADCAST_CONNECTED,
3006 CTDB_SRVID_REBALANCE_NODE,
3007 data);
3008 if (ret != 0) {
3009 DEBUG(DEBUG_WARNING,
3010 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
3013 first_add = false;
3016 ifaces = vnn->ifaces[0];
3017 iface = 1;
3018 while (vnn->ifaces[iface] != NULL) {
3019 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
3020 vnn->ifaces[iface]);
3021 iface++;
3024 len = strlen(ifaces) + 1;
3025 pub = talloc_zero_size(mem_ctx,
3026 offsetof(struct ctdb_addr_info_old, iface) + len);
3027 CTDB_NO_MEMORY(ctdb, pub);
3029 pub->addr = vnn->public_address;
3030 pub->mask = vnn->public_netmask_bits;
3031 pub->len = len;
3032 memcpy(&pub->iface[0], ifaces, pub->len);
3034 timeout = TAKEOVER_TIMEOUT();
3036 data.dsize = offsetof(struct ctdb_addr_info_old,
3037 iface) + pub->len;
3038 data.dptr = (uint8_t *)pub;
3040 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
3041 CTDB_CONTROL_ADD_PUBLIC_IP,
3042 0, data, async_data,
3043 &timeout, NULL);
3044 if (state == NULL) {
3045 DEBUG(DEBUG_ERR,
3046 (__location__
3047 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
3048 goto failed;
3051 ctdb_client_async_add(async_data, state);
3055 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
3056 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
3057 goto failed;
3060 talloc_free(mem_ctx);
3061 return 0;
3063 failed:
3064 talloc_free(mem_ctx);
3065 return -1;
3068 /* This control is sent to force the node to re-read the public addresses file
3069 and drop any addresses we should nnot longer host, and add new addresses
3070 that we are now able to host
3072 int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
3074 struct ctdb_reloadips_handle *h;
3075 pid_t parent = getpid();
3077 if (ctdb->reload_ips != NULL) {
3078 talloc_free(ctdb->reload_ips);
3079 ctdb->reload_ips = NULL;
3082 h = talloc(ctdb, struct ctdb_reloadips_handle);
3083 CTDB_NO_MEMORY(ctdb, h);
3084 h->ctdb = ctdb;
3085 h->c = NULL;
3086 h->status = -1;
3088 if (pipe(h->fd) == -1) {
3089 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
3090 talloc_free(h);
3091 return -1;
3094 h->child = ctdb_fork(ctdb);
3095 if (h->child == (pid_t)-1) {
3096 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
3097 close(h->fd[0]);
3098 close(h->fd[1]);
3099 talloc_free(h);
3100 return -1;
3103 /* child process */
3104 if (h->child == 0) {
3105 signed char res = 0;
3107 close(h->fd[0]);
3108 debug_extra = talloc_asprintf(NULL, "reloadips:");
3110 prctl_set_comment("ctdb_reloadips");
3111 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
3112 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
3113 res = -1;
3114 } else {
3115 res = ctdb_reloadips_child(ctdb);
3116 if (res != 0) {
3117 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
3121 sys_write(h->fd[1], &res, 1);
3122 ctdb_wait_for_process_to_exit(parent);
3123 _exit(0);
3126 h->c = talloc_steal(h, c);
3128 close(h->fd[1]);
3129 set_close_on_exec(h->fd[0]);
3131 talloc_set_destructor(h, ctdb_reloadips_destructor);
3134 h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
3135 ctdb_reloadips_child_handler, (void *)h);
3136 tevent_fd_set_auto_close(h->fde);
3138 tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
3139 ctdb_reloadips_timeout_event, h);
3141 /* we reply later */
3142 *async_reply = true;
3143 return 0;