ctdb-takeover: Use CTDB_CONTROL_START_IPREALLOCATE
[Samba.git] / ctdb / server / ctdb_takeover_helper.c
blob9bc84c8807864176f35f8e2779994a47b616482e
1 /*
2 CTDB IP takeover helper
4 Copyright (C) Martin Schwenke 2016
6 Based on ctdb_recovery_helper.c
7 Copyright (C) Amitay Isaacs 2015
9 and ctdb_takeover.c
10 Copyright (C) Ronnie Sahlberg 2007
11 Copyright (C) Andrew Tridgell 2007
12 Copyright (C) Martin Schwenke 2011
14 This program is free software; you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation; either version 3 of the License, or
17 (at your option) any later version.
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, see <http://www.gnu.org/licenses/>.
28 #include "replace.h"
29 #include "system/network.h"
30 #include "system/filesys.h"
32 #include <popt.h>
33 #include <talloc.h>
34 #include <tevent.h>
36 #include "lib/util/debug.h"
37 #include "lib/util/strv.h"
38 #include "lib/util/strv_util.h"
39 #include "lib/util/sys_rw.h"
40 #include "lib/util/time.h"
41 #include "lib/util/tevent_unix.h"
43 #include "protocol/protocol.h"
44 #include "protocol/protocol_api.h"
45 #include "protocol/protocol_util.h"
46 #include "client/client.h"
48 #include "common/logging.h"
50 #include "server/ipalloc.h"
/*
 * Per-control timeout in seconds.  Starts at 9 and is replaced by the
 * takeover_timeout tunable once the tunables have been fetched.
 */
static int takeover_timeout = 9;

/* Deadline that lies takeover_timeout seconds after the current time */
#define TIMEOUT() timeval_current_ofs(takeover_timeout, 0)
57 * Utility functions
/*
 * Shared receive helper for requests that carry no result payload.
 *
 * Returns true if the request finished without error.  Otherwise
 * returns false and, when perr is not NULL, stores the unix error
 * code in *perr.
 */
static bool generic_recv(struct tevent_req *req, int *perr)
{
	int unix_err;
	bool failed = tevent_req_is_unix_error(req, &unix_err);

	if (!failed) {
		return true;
	}

	if (perr != NULL) {
		*perr = unix_err;
	}
	return false;
}
74 static enum ipalloc_algorithm
75 determine_algorithm(const struct ctdb_tunable_list *tunables)
77 switch (tunables->ip_alloc_algorithm) {
78 case 0:
79 return IPALLOC_DETERMINISTIC;
80 case 1:
81 return IPALLOC_NONDETERMINISTIC;
82 case 2:
83 return IPALLOC_LCP2;
84 default:
85 return IPALLOC_LCP2;
89 /**********************************************************************/
/* State for the GET_PUBLIC_IPS computation */
struct get_public_ips_state {
	/* Nodes being queried and how many of them there are */
	uint32_t *pnns;
	int count;
	/* Result array with one entry per node, indexed by PNN */
	struct ctdb_public_ip_list *ips;
	/* Caller-owned per-node failure counters, indexed by PNN */
	uint32_t *ban_credits;
};

static void get_public_ips_done(struct tevent_req *subreq);
100 static struct tevent_req *get_public_ips_send(
101 TALLOC_CTX *mem_ctx,
102 struct tevent_context *ev,
103 struct ctdb_client_context *client,
104 uint32_t *pnns,
105 int count, int num_nodes,
106 uint32_t *ban_credits,
107 bool available_only)
109 struct tevent_req *req, *subreq;
110 struct get_public_ips_state *state;
111 struct ctdb_req_control request;
113 req = tevent_req_create(mem_ctx, &state, struct get_public_ips_state);
114 if (req == NULL) {
115 return NULL;
118 state->pnns = pnns;
119 state->count = count;
120 state->ban_credits = ban_credits;
122 state->ips = talloc_zero_array(state,
123 struct ctdb_public_ip_list,
124 num_nodes);
125 if (tevent_req_nomem(state->ips, req)) {
126 return tevent_req_post(req, ev);
129 /* Short circuit if no nodes being asked for IPs */
130 if (state->count == 0) {
131 tevent_req_done(req);
132 return tevent_req_post(req, ev);
135 ctdb_req_control_get_public_ips(&request, available_only);
136 subreq = ctdb_client_control_multi_send(mem_ctx, ev, client,
137 state->pnns,
138 state->count,
139 TIMEOUT(), &request);
140 if (tevent_req_nomem(subreq, req)) {
141 return tevent_req_post(req, ev);
143 tevent_req_set_callback(subreq, get_public_ips_done, req);
145 return req;
148 static void get_public_ips_done(struct tevent_req *subreq)
150 struct tevent_req *req = tevent_req_callback_data(
151 subreq, struct tevent_req);
152 struct get_public_ips_state *state = tevent_req_data(
153 req, struct get_public_ips_state);
154 struct ctdb_reply_control **reply;
155 int *err_list;
156 int ret, i;
157 bool status, found_errors;
159 status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list,
160 &reply);
161 TALLOC_FREE(subreq);
162 if (! status) {
163 for (i = 0; i < state->count; i++) {
164 if (err_list[i] != 0) {
165 uint32_t pnn = state->pnns[i];
167 D_ERR("control GET_PUBLIC_IPS failed on "
168 "node %u, ret=%d\n", pnn, err_list[i]);
170 state->ban_credits[pnn]++;
174 tevent_req_error(req, ret);
175 return;
178 found_errors = false;
179 for (i = 0; i < state->count; i++) {
180 uint32_t pnn;
181 struct ctdb_public_ip_list *ips;
183 pnn = state->pnns[i];
184 ret = ctdb_reply_control_get_public_ips(reply[i], state->ips,
185 &ips);
186 if (ret != 0) {
187 D_ERR("control GET_PUBLIC_IPS failed on "
188 "node %u\n", pnn);
189 state->ban_credits[pnn]++;
190 found_errors = true;
191 continue;
194 D_INFO("Fetched public IPs from node %u\n", pnn);
195 state->ips[pnn] = *ips;
198 if (found_errors) {
199 tevent_req_error(req, EIO);
200 return;
203 talloc_free(reply);
205 tevent_req_done(req);
208 static bool get_public_ips_recv(struct tevent_req *req, int *perr,
209 TALLOC_CTX *mem_ctx,
210 struct ctdb_public_ip_list **ips)
212 struct get_public_ips_state *state = tevent_req_data(
213 req, struct get_public_ips_state);
214 int err;
216 if (tevent_req_is_unix_error(req, &err)) {
217 if (perr != NULL) {
218 *perr = err;
220 return false;
223 *ips = talloc_steal(mem_ctx, state->ips);
225 return true;
228 /**********************************************************************/
/* Aggregate state for the RELEASE_IP stage (one control per address) */
struct release_ip_state {
	/* Controls sent, replies seen so far, and how many of those failed */
	int num_sent;
	int num_replies;
	int num_fails;
	/* Error code of a failure, reported when the stage fails */
	int err_any;
	/* Caller-owned per-node failure counters, indexed by PNN */
	uint32_t *ban_credits;
};

/* Per-address state attached to each RELEASE_IP multi-control */
struct release_ip_one_state {
	/* The enclosing release_ip request */
	struct tevent_req *req;
	/* Nodes asked to release this address */
	uint32_t *pnns;
	int count;
	/* Printable form of the address, for log messages */
	const char *ip_str;
};

static void release_ip_done(struct tevent_req *subreq);
247 static struct tevent_req *release_ip_send(TALLOC_CTX *mem_ctx,
248 struct tevent_context *ev,
249 struct ctdb_client_context *client,
250 uint32_t *pnns,
251 int count,
252 struct timeval timeout,
253 struct public_ip_list *all_ips,
254 uint32_t *ban_credits)
256 struct tevent_req *req, *subreq;
257 struct release_ip_state *state;
258 struct ctdb_req_control request;
259 struct public_ip_list *tmp_ip;
261 req = tevent_req_create(mem_ctx, &state, struct release_ip_state);
262 if (req == NULL) {
263 return NULL;
266 state->num_sent = 0;
267 state->num_replies = 0;
268 state->num_fails = 0;
269 state->ban_credits = ban_credits;
271 /* Send a RELEASE_IP to all nodes that should not be hosting
272 * each IP. For each IP, all but one of these will be
273 * redundant. However, the redundant ones are used to tell
274 * nodes which node should be hosting the IP so that commands
275 * like "ctdb ip" can display a particular nodes idea of who
276 * is hosting what. */
277 for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
278 struct release_ip_one_state *substate;
279 struct ctdb_public_ip ip;
280 int i;
282 substate = talloc_zero(state, struct release_ip_one_state);
283 if (tevent_req_nomem(substate, req)) {
284 return tevent_req_post(req, ev);
287 substate->pnns = talloc_zero_array(substate, uint32_t, count);
288 if (tevent_req_nomem(substate->pnns, req)) {
289 return tevent_req_post(req, ev);
292 substate->count = 0;
293 substate->req = req;
295 substate->ip_str = ctdb_sock_addr_to_string(substate,
296 &tmp_ip->addr,
297 false);
298 if (tevent_req_nomem(substate->ip_str, req)) {
299 return tevent_req_post(req, ev);
302 for (i = 0; i < count; i++) {
303 uint32_t pnn = pnns[i];
305 /* Skip this node if IP is not known */
306 if (! bitmap_query(tmp_ip->known_on, pnn)) {
307 continue;
310 /* If pnn is not the node that should be
311 * hosting the IP then add it to the list of
312 * nodes that need to do a release. */
313 if (tmp_ip->pnn != pnn) {
314 substate->pnns[substate->count] = pnn;
315 substate->count++;
319 if (substate->count == 0) {
320 /* No releases to send for this address... */
321 TALLOC_FREE(substate);
322 continue;
325 ip.pnn = tmp_ip->pnn;
326 ip.addr = tmp_ip->addr;
327 ctdb_req_control_release_ip(&request, &ip);
328 subreq = ctdb_client_control_multi_send(state, ev, client,
329 substate->pnns,
330 substate->count,
331 timeout,/* cumulative */
332 &request);
333 if (tevent_req_nomem(subreq, req)) {
334 return tevent_req_post(req, ev);
336 tevent_req_set_callback(subreq, release_ip_done, substate);
338 state->num_sent++;
341 /* None sent, finished... */
342 if (state->num_sent == 0) {
343 tevent_req_done(req);
344 return tevent_req_post(req, ev);
347 return req;
350 static void release_ip_done(struct tevent_req *subreq)
352 struct release_ip_one_state *substate = tevent_req_callback_data(
353 subreq, struct release_ip_one_state);
354 struct tevent_req *req = substate->req;
355 struct release_ip_state *state = tevent_req_data(
356 req, struct release_ip_state);
357 int ret, i;
358 int *err_list;
359 bool status, found_errors;
361 status = ctdb_client_control_multi_recv(subreq, &ret, state,
362 &err_list, NULL);
363 TALLOC_FREE(subreq);
365 if (status) {
366 D_INFO("RELEASE_IP %s succeeded on %d nodes\n",
367 substate->ip_str, substate->count);
368 goto done;
371 /* Get some clear error messages out of err_list and count
372 * banning credits
374 found_errors = false;
375 for (i = 0; i < substate->count; i++) {
376 int err = err_list[i];
377 if (err != 0) {
378 uint32_t pnn = substate->pnns[i];
380 D_ERR("RELEASE_IP %s failed on node %u, "
381 "ret=%d\n", substate->ip_str, pnn, err);
383 state->ban_credits[pnn]++;
384 state->err_any = err;
385 found_errors = true;
388 if (! found_errors) {
389 D_ERR("RELEASE_IP %s internal error, ret=%d\n",
390 substate->ip_str, ret);
391 state->err_any = EIO;
394 state->num_fails++;
396 done:
397 talloc_free(substate);
399 state->num_replies++;
401 if (state->num_replies < state->num_sent) {
402 /* Not all replies received, don't go further */
403 return;
406 if (state->num_fails > 0) {
407 tevent_req_error(req, state->err_any);
408 return;
411 tevent_req_done(req);
/* Collect the outcome of release_ip_send(); see generic_recv() */
static bool release_ip_recv(struct tevent_req *req, int *perr)
{
	bool ok = generic_recv(req, perr);

	return ok;
}
419 /**********************************************************************/
/* Aggregate state for the TAKEOVER_IP stage (one control per address) */
struct take_ip_state {
	/* Controls sent, replies seen so far, and how many of those failed */
	int num_sent;
	int num_replies;
	int num_fails;
	/* Error code of the most recent failure */
	int err_any;
	/* Caller-owned per-node failure counters, indexed by PNN */
	uint32_t *ban_credits;
};

/* Per-address state attached to each TAKEOVER_IP control */
struct take_ip_one_state {
	/* The enclosing take_ip request */
	struct tevent_req *req;
	/* Node asked to host the address */
	uint32_t pnn;
	/* Printable form of the address, for log messages */
	const char *ip_str;
};

static void take_ip_done(struct tevent_req *subreq);
437 static struct tevent_req *take_ip_send(TALLOC_CTX *mem_ctx,
438 struct tevent_context *ev,
439 struct ctdb_client_context *client,
440 struct timeval timeout,
441 struct public_ip_list *all_ips,
442 uint32_t *ban_credits)
444 struct tevent_req *req, *subreq;
445 struct take_ip_state *state;
446 struct ctdb_req_control request;
447 struct public_ip_list *tmp_ip;
449 req = tevent_req_create(mem_ctx, &state, struct take_ip_state);
450 if (req == NULL) {
451 return NULL;
454 state->num_sent = 0;
455 state->num_replies = 0;
456 state->num_fails = 0;
457 state->ban_credits = ban_credits;
459 /* For each IP, send a TAKOVER_IP to the node that should be
460 * hosting it. Many of these will often be redundant (since
461 * the allocation won't have changed) but they can be useful
462 * to recover from inconsistencies. */
463 for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
464 struct take_ip_one_state *substate;
465 struct ctdb_public_ip ip;
467 if (tmp_ip->pnn == CTDB_UNKNOWN_PNN) {
468 /* IP will be unassigned */
469 continue;
472 substate = talloc_zero(state, struct take_ip_one_state);
473 if (tevent_req_nomem(substate, req)) {
474 return tevent_req_post(req, ev);
477 substate->req = req;
478 substate->pnn = tmp_ip->pnn;
480 substate->ip_str = ctdb_sock_addr_to_string(substate,
481 &tmp_ip->addr,
482 false);
483 if (tevent_req_nomem(substate->ip_str, req)) {
484 return tevent_req_post(req, ev);
487 ip.pnn = tmp_ip->pnn;
488 ip.addr = tmp_ip->addr;
489 ctdb_req_control_takeover_ip(&request, &ip);
490 subreq = ctdb_client_control_send(
491 state, ev, client, tmp_ip->pnn,
492 timeout, /* cumulative */
493 &request);
494 if (tevent_req_nomem(subreq, req)) {
495 return tevent_req_post(req, ev);
497 tevent_req_set_callback(subreq, take_ip_done, substate);
499 state->num_sent++;
502 /* None sent, finished... */
503 if (state->num_sent == 0) {
504 tevent_req_done(req);
505 return tevent_req_post(req, ev);
508 return req;
511 static void take_ip_done(struct tevent_req *subreq)
513 struct take_ip_one_state *substate = tevent_req_callback_data(
514 subreq, struct take_ip_one_state);
515 struct tevent_req *req = substate->req;
516 struct ctdb_reply_control *reply;
517 struct take_ip_state *state = tevent_req_data(
518 req, struct take_ip_state);
519 int ret = 0;
520 bool status;
522 status = ctdb_client_control_recv(subreq, &ret, state, &reply);
523 TALLOC_FREE(subreq);
525 if (! status) {
526 D_ERR("TAKEOVER_IP %s failed to node %u, ret=%d\n",
527 substate->ip_str, substate->pnn, ret);
528 goto fail;
531 ret = ctdb_reply_control_takeover_ip(reply);
532 if (ret != 0) {
533 D_ERR("TAKEOVER_IP %s failed on node %u, ret=%d\n",
534 substate->ip_str, substate->pnn, ret);
535 goto fail;
538 D_INFO("TAKEOVER_IP %s succeeded on node %u\n",
539 substate->ip_str, substate->pnn);
540 goto done;
542 fail:
543 state->ban_credits[substate->pnn]++;
544 state->num_fails++;
545 state->err_any = ret;
547 done:
548 talloc_free(substate);
550 state->num_replies++;
552 if (state->num_replies < state->num_sent) {
553 /* Not all replies received, don't go further */
554 return;
557 if (state->num_fails > 0) {
558 tevent_req_error(req, state->err_any);
559 return;
562 tevent_req_done(req);
/* Collect the outcome of take_ip_send(); see generic_recv() */
static bool take_ip_recv(struct tevent_req *req, int *perr)
{
	bool ok = generic_recv(req, perr);

	return ok;
}
570 /**********************************************************************/
/* State for the IPREALLOCATED stage */
struct ipreallocated_state {
	/* Nodes the control is sent to and how many of them there are */
	uint32_t *pnns;
	int count;
	/* Caller-owned per-node failure counters, indexed by PNN */
	uint32_t *ban_credits;
};

static void ipreallocated_done(struct tevent_req *subreq);
580 static struct tevent_req *ipreallocated_send(TALLOC_CTX *mem_ctx,
581 struct tevent_context *ev,
582 struct ctdb_client_context *client,
583 uint32_t *pnns,
584 int count,
585 struct timeval timeout,
586 uint32_t *ban_credits)
588 struct tevent_req *req, *subreq;
589 struct ipreallocated_state *state;
590 struct ctdb_req_control request;
592 req = tevent_req_create(mem_ctx, &state, struct ipreallocated_state);
593 if (req == NULL) {
594 return NULL;
597 state->pnns = pnns;
598 state->count = count;
599 state->ban_credits = ban_credits;
601 ctdb_req_control_ipreallocated(&request);
602 subreq = ctdb_client_control_multi_send(state, ev, client,
603 pnns, count,
604 timeout, /* cumulative */
605 &request);
606 if (tevent_req_nomem(subreq, req)) {
607 return tevent_req_post(req, ev);
609 tevent_req_set_callback(subreq, ipreallocated_done, req);
611 return req;
614 static void ipreallocated_done(struct tevent_req *subreq)
616 struct tevent_req *req = tevent_req_callback_data(
617 subreq, struct tevent_req);
618 struct ipreallocated_state *state = tevent_req_data(
619 req, struct ipreallocated_state);
620 int *err_list = NULL;
621 int ret, i;
622 bool status, found_errors;
624 status = ctdb_client_control_multi_recv(subreq, &ret, state,
625 &err_list, NULL);
626 TALLOC_FREE(subreq);
628 if (status) {
629 D_INFO("IPREALLOCATED succeeded on %d nodes\n", state->count);
630 tevent_req_done(req);
631 return;
634 /* Get some clear error messages out of err_list and count
635 * banning credits
637 found_errors = false;
638 for (i = 0; i < state->count; i++) {
639 int err = err_list[i];
640 if (err != 0) {
641 uint32_t pnn = state->pnns[i];
643 D_ERR("IPREALLOCATED failed on node %u, ret=%d\n",
644 pnn, err);
646 state->ban_credits[pnn]++;
647 found_errors = true;
651 if (! found_errors) {
652 D_ERR("IPREALLOCATED internal error, ret=%d\n", ret);
655 tevent_req_error(req, ret);
/* Collect the outcome of ipreallocated_send(); see generic_recv() */
static bool ipreallocated_recv(struct tevent_req *req, int *perr)
{
	bool ok = generic_recv(req, perr);

	return ok;
}
663 /**********************************************************************/
/* State for the START_IPREALLOCATE stage */
struct start_ipreallocate_state {
	/* Nodes the control is sent to and how many of them there are */
	uint32_t *pnns;
	int count;
	/* Caller-owned per-node failure counters, indexed by PNN */
	uint32_t *ban_credits;
};

static void start_ipreallocate_done(struct tevent_req *subreq);
673 static struct tevent_req *start_ipreallocate_send(
674 TALLOC_CTX *mem_ctx,
675 struct tevent_context *ev,
676 struct ctdb_client_context *client,
677 uint32_t *pnns,
678 int count,
679 struct timeval timeout,
680 uint32_t *ban_credits)
682 struct tevent_req *req, *subreq;
683 struct start_ipreallocate_state *state;
684 struct ctdb_req_control request;
686 req = tevent_req_create(mem_ctx, &state, struct start_ipreallocate_state);
687 if (req == NULL) {
688 return NULL;
691 state->pnns = pnns;
692 state->count = count;
693 state->ban_credits = ban_credits;
695 ctdb_req_control_start_ipreallocate(&request);
696 subreq = ctdb_client_control_multi_send(state, ev, client,
697 pnns, count,
698 timeout, /* cumulative */
699 &request);
700 if (tevent_req_nomem(subreq, req)) {
701 return tevent_req_post(req, ev);
703 tevent_req_set_callback(subreq, start_ipreallocate_done, req);
705 return req;
708 static void start_ipreallocate_done(struct tevent_req *subreq)
710 struct tevent_req *req = tevent_req_callback_data(
711 subreq, struct tevent_req);
712 struct start_ipreallocate_state *state = tevent_req_data(
713 req, struct start_ipreallocate_state);
714 int *err_list = NULL;
715 int ret, i;
716 bool status, found_errors;
718 status = ctdb_client_control_multi_recv(subreq, &ret, state,
719 &err_list, NULL);
720 TALLOC_FREE(subreq);
722 if (status) {
723 D_INFO("START_IPREALLOCATE succeeded on %d nodes\n", state->count);
724 tevent_req_done(req);
725 return;
728 /* Get some clear error messages out of err_list and count
729 * banning credits
731 found_errors = false;
732 for (i = 0; i < state->count; i++) {
733 int err = err_list[i];
734 if (err != 0) {
735 uint32_t pnn = state->pnns[i];
737 D_ERR("START_IPREALLOCATE failed on node %u, ret=%d\n",
738 pnn, err);
740 state->ban_credits[pnn]++;
741 found_errors = true;
745 if (! found_errors) {
746 D_ERR("STARTREALLOCATE internal error, ret=%d\n", ret);
749 tevent_req_error(req, ret);
/* Collect the outcome of start_ipreallocate_send(); see generic_recv() */
static bool start_ipreallocate_recv(struct tevent_req *req, int *perr)
{
	bool ok = generic_recv(req, perr);

	return ok;
}
757 /**********************************************************************/
/*
 * Recalculate the allocation of public IPs to nodes and have the
 * nodes host their allocated addresses.
 *
 * - Get tunables
 * - Get nodemap
 * - Initialise IP allocation state.  Pass:
 *   + algorithm to be used;
 *   + various tunables (NoIPTakeover, NoIPFailback)
 *   + list of nodes to force rebalance (internal structure, currently
 *     no way to fetch, only used by LCP2 for nodes that have had new
 *     IP addresses added).
 * - Set IP flags for IP allocation based on node map
 * - Retrieve known and available IP addresses (done separately so
 *   values can be faked in unit testing)
 * - Use ipalloc_set_public_ips() to set known and available IP
 *   addresses for allocation
 * - If cluster can't host IP addresses then jump to IPREALLOCATED
 * - Run IP allocation algorithm
 * - Send START_IPREALLOCATE to all nodes
 * - Send RELEASE_IP to all nodes for IPs they should not host
 * - Send TAKEOVER_IP to each node for the IPs it should host
 * - Send IPREALLOCATED to all nodes
 */
784 struct takeover_state {
785 struct tevent_context *ev;
786 struct ctdb_client_context *client;
787 struct timeval timeout;
788 unsigned int num_nodes;
789 uint32_t *pnns_connected;
790 int num_connected;
791 uint32_t *pnns_active;
792 int num_active;
793 uint32_t destnode;
794 uint32_t *force_rebalance_nodes;
795 struct ctdb_tunable_list *tun_list;
796 struct ipalloc_state *ipalloc_state;
797 struct ctdb_public_ip_list *known_ips;
798 struct public_ip_list *all_ips;
799 uint32_t *ban_credits;
/*
 * Stage callbacks of the takeover computation, in the order in which
 * they normally run.  takeover_ipreallocated() and takeover_failed()
 * are called with the top-level request, not a subrequest.
 */
static void takeover_tunables_done(struct tevent_req *subreq);
static void takeover_nodemap_done(struct tevent_req *subreq);
static void takeover_known_ips_done(struct tevent_req *subreq);
static void takeover_avail_ips_done(struct tevent_req *subreq);
static void takeover_start_ipreallocate_done(struct tevent_req *subreq);
static void takeover_release_ip_done(struct tevent_req *subreq);
static void takeover_take_ip_done(struct tevent_req *subreq);
static void takeover_ipreallocated(struct tevent_req *req);
static void takeover_ipreallocated_done(struct tevent_req *subreq);
static void takeover_failed(struct tevent_req *req, int ret);
static void takeover_failed_done(struct tevent_req *subreq);
814 static struct tevent_req *takeover_send(TALLOC_CTX *mem_ctx,
815 struct tevent_context *ev,
816 struct ctdb_client_context *client,
817 uint32_t *force_rebalance_nodes)
819 struct tevent_req *req, *subreq;
820 struct takeover_state *state;
821 struct ctdb_req_control request;
823 req = tevent_req_create(mem_ctx, &state, struct takeover_state);
824 if (req == NULL) {
825 return NULL;
828 state->ev = ev;
829 state->client = client;
830 state->force_rebalance_nodes = force_rebalance_nodes;
831 state->destnode = ctdb_client_pnn(client);
833 ctdb_req_control_get_all_tunables(&request);
834 subreq = ctdb_client_control_send(state, state->ev, state->client,
835 state->destnode, TIMEOUT(),
836 &request);
837 if (tevent_req_nomem(subreq, req)) {
838 return tevent_req_post(req, ev);
840 tevent_req_set_callback(subreq, takeover_tunables_done, req);
842 return req;
845 static void takeover_tunables_done(struct tevent_req *subreq)
847 struct tevent_req *req = tevent_req_callback_data(
848 subreq, struct tevent_req);
849 struct takeover_state *state = tevent_req_data(
850 req, struct takeover_state);
851 struct ctdb_reply_control *reply;
852 struct ctdb_req_control request;
853 int ret;
854 bool status;
856 status = ctdb_client_control_recv(subreq, &ret, state, &reply);
857 TALLOC_FREE(subreq);
858 if (! status) {
859 D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
860 tevent_req_error(req, ret);
861 return;
864 ret = ctdb_reply_control_get_all_tunables(reply, state,
865 &state->tun_list);
866 if (ret != 0) {
867 D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
868 tevent_req_error(req, ret);
869 return;
872 talloc_free(reply);
874 takeover_timeout = state->tun_list->takeover_timeout;
876 ctdb_req_control_get_nodemap(&request);
877 subreq = ctdb_client_control_send(state, state->ev, state->client,
878 state->destnode, TIMEOUT(),
879 &request);
880 if (tevent_req_nomem(subreq, req)) {
881 return;
883 tevent_req_set_callback(subreq, takeover_nodemap_done, req);
886 static void takeover_nodemap_done(struct tevent_req *subreq)
888 struct tevent_req *req = tevent_req_callback_data(
889 subreq, struct tevent_req);
890 struct takeover_state *state = tevent_req_data(
891 req, struct takeover_state);
892 struct ctdb_reply_control *reply;
893 bool status;
894 int ret;
895 struct ctdb_node_map *nodemap;
896 const char *ptr;
898 status = ctdb_client_control_recv(subreq, &ret, state, &reply);
899 TALLOC_FREE(subreq);
900 if (! status) {
901 D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n",
902 state->destnode, ret);
903 tevent_req_error(req, ret);
904 return;
907 ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap);
908 if (ret != 0) {
909 D_ERR("control GET_NODEMAP failed, ret=%d\n", ret);
910 tevent_req_error(req, ret);
911 return;
914 state->num_nodes = nodemap->num;
916 state->num_connected = list_of_connected_nodes(nodemap,
917 CTDB_UNKNOWN_PNN, state,
918 &state->pnns_connected);
919 if (state->num_connected <= 0) {
920 tevent_req_error(req, ENOMEM);
921 return;
924 state->num_active = list_of_active_nodes(nodemap,
925 CTDB_UNKNOWN_PNN, state,
926 &state->pnns_active);
927 if (state->num_active <= 0) {
928 tevent_req_error(req, ENOMEM);
929 return;
932 /* Default timeout for early jump to IPREALLOCATED. See below
933 * for explanation of 3 times...
935 state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
937 state->ban_credits = talloc_zero_array(state, uint32_t,
938 state->num_nodes);
939 if (tevent_req_nomem(state->ban_credits, req)) {
940 return;
943 ptr = getenv("CTDB_DISABLE_IP_FAILOVER");
944 if (ptr != NULL) {
945 /* IP failover is completely disabled so just send out
946 * ipreallocated event.
948 takeover_ipreallocated(req);
949 return;
952 state->ipalloc_state =
953 ipalloc_state_init(
954 state, state->num_nodes,
955 determine_algorithm(state->tun_list),
956 (state->tun_list->no_ip_takeover != 0),
957 (state->tun_list->no_ip_failback != 0),
958 state->force_rebalance_nodes);
959 if (tevent_req_nomem(state->ipalloc_state, req)) {
960 return;
963 subreq = get_public_ips_send(state, state->ev, state->client,
964 state->pnns_connected, state->num_connected,
965 state->num_nodes, state->ban_credits,
966 false);
967 if (tevent_req_nomem(subreq, req)) {
968 return;
971 tevent_req_set_callback(subreq, takeover_known_ips_done, req);
974 static void takeover_known_ips_done(struct tevent_req *subreq)
976 struct tevent_req *req = tevent_req_callback_data(
977 subreq, struct tevent_req);
978 struct takeover_state *state = tevent_req_data(
979 req, struct takeover_state);
980 int ret;
981 bool status;
982 uint32_t *pnns = NULL;
983 int count, i;
985 status = get_public_ips_recv(subreq, &ret, state, &state->known_ips);
986 TALLOC_FREE(subreq);
988 if (! status) {
989 D_ERR("Failed to fetch known public IPs\n");
990 takeover_failed(req, ret);
991 return;
994 /* Get available IPs from active nodes that actually have known IPs */
996 pnns = talloc_zero_array(state, uint32_t, state->num_active);
997 if (tevent_req_nomem(pnns, req)) {
998 return;
1001 count = 0;
1002 for (i = 0; i < state->num_active; i++) {
1003 uint32_t pnn = state->pnns_active[i];
1005 /* If pnn has IPs then fetch available IPs from it */
1006 if (state->known_ips[pnn].num > 0) {
1007 pnns[count] = pnn;
1008 count++;
1012 subreq = get_public_ips_send(state, state->ev, state->client,
1013 pnns, count,
1014 state->num_nodes, state->ban_credits,
1015 true);
1016 if (tevent_req_nomem(subreq, req)) {
1017 return;
1020 tevent_req_set_callback(subreq, takeover_avail_ips_done, req);
1023 static void takeover_avail_ips_done(struct tevent_req *subreq)
1025 struct tevent_req *req = tevent_req_callback_data(
1026 subreq, struct tevent_req);
1027 struct takeover_state *state = tevent_req_data(
1028 req, struct takeover_state);
1029 bool status;
1030 int ret;
1031 struct ctdb_public_ip_list *available_ips;
1033 status = get_public_ips_recv(subreq, &ret, state, &available_ips);
1034 TALLOC_FREE(subreq);
1036 if (! status) {
1037 D_ERR("Failed to fetch available public IPs\n");
1038 takeover_failed(req, ret);
1039 return;
1042 ipalloc_set_public_ips(state->ipalloc_state,
1043 state->known_ips, available_ips);
1045 if (! ipalloc_can_host_ips(state->ipalloc_state)) {
1046 D_NOTICE("No nodes available to host public IPs yet\n");
1047 takeover_ipreallocated(req);
1048 return;
1051 /* Do the IP reassignment calculations */
1052 state->all_ips = ipalloc(state->ipalloc_state);
1053 if (tevent_req_nomem(state->all_ips, req)) {
1054 return;
1057 /* Each of the following stages (START_IPREALLOCATE, RELEASE_IP, TAKEOVER_IP,
1058 * IPREALLOCATED) notionally has a timeout of TakeoverTimeout
1059 * seconds. However, RELEASE_IP can take longer due to TCP
1060 * connection killing, so sometimes needs more time.
1061 * Therefore, use a cumulative timeout of TakeoverTimeout * 3
1062 * seconds across all 4 stages. Using a longer cumulative timeout (e.g.*4)
1063 * would take the takeover run timeout over 30s, which combined with database
1064 * recovery time takes the timeout too close to acceptable SMB limits.
1065 * No explicit expiry checks are
1066 * needed before each stage because tevent is smart enough to
1067 * fire the timeouts even if they are in the past. Initialise
1068 * this here so it explicitly covers the stages we're
1069 * interested in but, in particular, not the time taken by the
1070 * ipalloc().
1072 state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
1074 subreq = start_ipreallocate_send(state,
1075 state->ev,
1076 state->client,
1077 state->pnns_connected,
1078 state->num_connected,
1079 state->timeout,
1080 state->ban_credits);
1081 if (tevent_req_nomem(subreq, req)) {
1082 return;
1084 tevent_req_set_callback(subreq, takeover_start_ipreallocate_done, req);
1087 static void takeover_start_ipreallocate_done(struct tevent_req *subreq)
1089 struct tevent_req *req = tevent_req_callback_data(
1090 subreq, struct tevent_req);
1091 struct takeover_state *state = tevent_req_data(
1092 req, struct takeover_state);
1093 int ret;
1094 bool status;
1096 status = start_ipreallocate_recv(subreq, &ret);
1097 TALLOC_FREE(subreq);
1099 if (! status) {
1100 takeover_failed(req, ret);
1101 return;
1104 subreq = release_ip_send(state,
1105 state->ev,
1106 state->client,
1107 state->pnns_connected,
1108 state->num_connected,
1109 state->timeout,
1110 state->all_ips,
1111 state->ban_credits);
1112 if (tevent_req_nomem(subreq, req)) {
1113 return;
1115 tevent_req_set_callback(subreq, takeover_release_ip_done, req);
1118 static void takeover_release_ip_done(struct tevent_req *subreq)
1120 struct tevent_req *req = tevent_req_callback_data(
1121 subreq, struct tevent_req);
1122 struct takeover_state *state = tevent_req_data(
1123 req, struct takeover_state);
1124 int ret;
1125 bool status;
1127 status = release_ip_recv(subreq, &ret);
1128 TALLOC_FREE(subreq);
1130 if (! status) {
1131 takeover_failed(req, ret);
1132 return;
1135 /* All released, now for takeovers */
1137 subreq = take_ip_send(state, state->ev, state->client,
1138 state->timeout, state->all_ips,
1139 state->ban_credits);
1140 if (tevent_req_nomem(subreq, req)) {
1141 return;
1143 tevent_req_set_callback(subreq, takeover_take_ip_done, req);
1146 static void takeover_take_ip_done(struct tevent_req *subreq)
1148 struct tevent_req *req = tevent_req_callback_data(
1149 subreq, struct tevent_req);
1150 int ret = 0;
1151 bool status;
1153 status = take_ip_recv(subreq, &ret);
1154 TALLOC_FREE(subreq);
1156 if (! status) {
1157 takeover_failed(req, ret);
1158 return;
1161 takeover_ipreallocated(req);
1164 static void takeover_ipreallocated(struct tevent_req *req)
1166 struct takeover_state *state = tevent_req_data(
1167 req, struct takeover_state);
1168 struct tevent_req *subreq;
1170 subreq = ipreallocated_send(state, state->ev, state->client,
1171 state->pnns_connected,
1172 state->num_connected,
1173 state->timeout,
1174 state->ban_credits);
1175 if (tevent_req_nomem(subreq, req)) {
1176 return;
1178 tevent_req_set_callback(subreq, takeover_ipreallocated_done, req);
1181 static void takeover_ipreallocated_done(struct tevent_req *subreq)
1183 struct tevent_req *req = tevent_req_callback_data(
1184 subreq, struct tevent_req);
1185 int ret;
1186 bool status;
1188 status = ipreallocated_recv(subreq, &ret);
1189 TALLOC_FREE(subreq);
1191 if (! status) {
1192 takeover_failed(req, ret);
1193 return;
1196 tevent_req_done(req);
/* Context kept while the banning message is in flight */
struct takeover_failed_state {
	/* The takeover request to fail afterwards */
	struct tevent_req *req;
	/* The error code to fail it with */
	int ret;
};
1204 void takeover_failed(struct tevent_req *req, int ret)
1206 struct takeover_state *state = tevent_req_data(
1207 req, struct takeover_state);
1208 struct tevent_req *subreq;
1209 uint32_t max_pnn = CTDB_UNKNOWN_PNN;
1210 unsigned int max_credits = 0;
1211 uint32_t pnn;
1213 /* Check that bans are enabled */
1214 if (state->tun_list->enable_bans == 0) {
1215 tevent_req_error(req, ret);
1216 return;
1219 for (pnn = 0; pnn < state->num_nodes; pnn++) {
1220 if (state->ban_credits[pnn] > max_credits) {
1221 max_pnn = pnn;
1222 max_credits = state->ban_credits[pnn];
1226 if (max_credits > 0) {
1227 struct ctdb_req_message message;
1228 struct takeover_failed_state *substate;
1230 D_WARNING("Assigning banning credits to node %u\n", max_pnn);
1232 substate = talloc_zero(state, struct takeover_failed_state);
1233 if (tevent_req_nomem(substate, req)) {
1234 return;
1236 substate->req = req;
1237 substate->ret = ret;
1239 message.srvid = CTDB_SRVID_BANNING;
1240 message.data.pnn = max_pnn;
1242 subreq = ctdb_client_message_send(
1243 state, state->ev, state->client,
1244 ctdb_client_pnn(state->client),
1245 &message);
1246 if (subreq == NULL) {
1247 D_ERR("failed to assign banning credits\n");
1248 tevent_req_error(req, ret);
1249 return;
1251 tevent_req_set_callback(subreq, takeover_failed_done, substate);
1252 } else {
1253 tevent_req_error(req, ret);
1257 static void takeover_failed_done(struct tevent_req *subreq)
1259 struct takeover_failed_state *substate = tevent_req_callback_data(
1260 subreq, struct takeover_failed_state);
1261 struct tevent_req *req = substate->req;
1262 int ret;
1263 bool status;
1265 status = ctdb_client_message_recv(subreq, &ret);
1266 TALLOC_FREE(subreq);
1267 if (! status) {
1268 D_ERR("failed to assign banning credits, ret=%d\n", ret);
1271 ret = substate->ret;
1272 talloc_free(substate);
1273 tevent_req_error(req, ret);
/*
 * Collect the outcome of takeover_send().
 *
 * When perr is not NULL, *perr is always written: the error code on
 * failure, 0 on success.  Callers therefore do not have to
 * pre-initialise it.
 */
static void takeover_recv(struct tevent_req *req, int *perr)
{
	bool ok = generic_recv(req, perr);

	if (ok && perr != NULL) {
		*perr = 0;
	}
}
1281 static uint32_t *parse_node_list(TALLOC_CTX *mem_ctx, const char* s)
1283 char *strv = NULL;
1284 int num, i, ret;
1285 char *t;
1286 uint32_t *nodes;
1288 ret = strv_split(mem_ctx, &strv, s, ",");
1289 if (ret != 0) {
1290 D_ERR("out of memory\n");
1291 return NULL;
1294 num = strv_count(strv);
1296 nodes = talloc_array(mem_ctx, uint32_t, num);
1297 if (nodes == NULL) {
1298 D_ERR("out of memory\n");
1299 return NULL;
1302 t = NULL;
1303 for (i = 0; i < num; i++) {
1304 t = strv_next(strv, t);
1305 nodes[i] = atoi(t);
1308 return nodes;
/* Print a one-line usage summary to stderr */
static void usage(const char *progname)
{
	static const char fmt[] =
		"\nUsage: %s <output-fd> <ctdb-socket-path> "
		"[<force-rebalance-nodes>]\n";

	fprintf(stderr, fmt, progname);
}
1320 * Arguments - write fd, socket path
1322 int main(int argc, const char *argv[])
1324 int write_fd;
1325 const char *sockpath;
1326 TALLOC_CTX *mem_ctx;
1327 struct tevent_context *ev;
1328 struct ctdb_client_context *client;
1329 bool status;
1330 int ret;
1331 struct tevent_req *req;
1332 uint32_t *force_rebalance_nodes = NULL;
1334 if (argc < 3 || argc > 4) {
1335 usage(argv[0]);
1336 exit(1);
1339 write_fd = atoi(argv[1]);
1340 sockpath = argv[2];
1342 mem_ctx = talloc_new(NULL);
1343 if (mem_ctx == NULL) {
1344 fprintf(stderr, "talloc_new() failed\n");
1345 ret = ENOMEM;
1346 goto done;
1349 if (argc == 4) {
1350 force_rebalance_nodes = parse_node_list(mem_ctx, argv[3]);
1351 if (force_rebalance_nodes == NULL) {
1352 usage(argv[0]);
1353 ret = EINVAL;
1354 goto done;
1358 ret = logging_init(mem_ctx, NULL, NULL, "ctdb-takeover");
1359 if (ret != 0) {
1360 fprintf(stderr,
1361 "ctdb-takeover: Unable to initialize logging\n");
1362 goto done;
1365 ev = tevent_context_init(mem_ctx);
1366 if (ev == NULL) {
1367 D_ERR("tevent_context_init() failed\n");
1368 ret = ENOMEM;
1369 goto done;
1372 status = logging_setup_sighup_handler(ev, mem_ctx, NULL, NULL);
1373 if (!status) {
1374 D_ERR("logging_setup_sighup_handler() failed\n");
1375 ret = ENOMEM;
1376 goto done;
1379 ret = ctdb_client_init(mem_ctx, ev, sockpath, &client);
1380 if (ret != 0) {
1381 D_ERR("ctdb_client_init() failed, ret=%d\n", ret);
1382 goto done;
1385 req = takeover_send(mem_ctx, ev, client, force_rebalance_nodes);
1386 if (req == NULL) {
1387 D_ERR("takeover_send() failed\n");
1388 ret = 1;
1389 goto done;
1392 if (! tevent_req_poll(req, ev)) {
1393 D_ERR("tevent_req_poll() failed\n");
1394 ret = 1;
1395 goto done;
1398 takeover_recv(req, &ret);
1399 TALLOC_FREE(req);
1400 if (ret != 0) {
1401 D_ERR("takeover run failed, ret=%d\n", ret);
1404 done:
1405 sys_write_v(write_fd, &ret, sizeof(ret));
1407 talloc_free(mem_ctx);
1408 return ret;