s3: VFS: vfs_snapper: Make chflags return errno = EROFS on a shadow copy path.
[Samba.git] / ctdb / server / ipalloc_lcp2.c
blobbc2936bb76ee72fb2d09e42462a2eee241858684
1 /*
2 ctdb ip takeover code
4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
22 #include "replace.h"
23 #include "system/network.h"
25 #include "lib/util/debug.h"
26 #include "common/logging.h"
28 #include "protocol/protocol_util.h"
30 #include "server/ipalloc_private.h"
33 * This is the length of the longtest common prefix between the IPs.
34 * It is calculated by XOR-ing the 2 IPs together and counting the
35 * number of leading zeroes. The implementation means that all
36 * addresses end up being 128 bits long.
38 * FIXME? Should we consider IPv4 and IPv6 separately given that the
39 * 12 bytes of 0 prefix padding will hurt the algorithm if there are
40 * lots of nodes and IP addresses?
42 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
44 uint32_t ip1_k[IP_KEYLEN];
45 uint32_t *t;
46 int i;
47 uint32_t x;
49 uint32_t distance = 0;
51 memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
52 t = ip_key(ip2);
53 for (i=0; i<IP_KEYLEN; i++) {
54 x = ip1_k[i] ^ t[i];
55 if (x == 0) {
56 distance += 32;
57 } else {
58 /* Count number of leading zeroes.
59 * FIXME? This could be optimised...
61 while ((x & ((uint32_t)1 << 31)) == 0) {
62 x <<= 1;
63 distance += 1;
68 return distance;
71 /* Calculate the IP distance for the given IP relative to IPs on the
72 given node. The ips argument is generally the all_ips variable
73 used in the main part of the algorithm.
75 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
76 struct public_ip_list *ips,
77 unsigned int pnn)
79 struct public_ip_list *t;
80 uint32_t d;
82 uint32_t sum = 0;
84 for (t = ips; t != NULL; t = t->next) {
85 if (t->pnn != pnn) {
86 continue;
89 /* Optimisation: We never calculate the distance
90 * between an address and itself. This allows us to
91 * calculate the effect of removing an address from a
92 * node by simply calculating the distance between
93 * that address and all of the exitsing addresses.
94 * Moreover, we assume that we're only ever dealing
95 * with addresses from all_ips so we can identify an
96 * address via a pointer rather than doing a more
97 * expensive address comparison. */
98 if (&(t->addr) == ip) {
99 continue;
102 d = ip_distance(ip, &(t->addr));
103 sum += d * d; /* Cheaper than pulling in math.h :-) */
106 return sum;
109 /* Return the LCP2 imbalance metric for addresses currently assigned
110 to the given node.
112 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips,
113 unsigned int pnn)
115 struct public_ip_list *t;
117 uint32_t imbalance = 0;
119 for (t = all_ips; t != NULL; t = t->next) {
120 if (t->pnn != pnn) {
121 continue;
123 /* Pass the rest of the IPs rather than the whole
124 all_ips input list.
126 imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
129 return imbalance;
132 static bool lcp2_init(struct ipalloc_state *ipalloc_state,
133 uint32_t **lcp2_imbalances,
134 bool **rebalance_candidates)
136 unsigned int i, numnodes;
137 struct public_ip_list *t;
139 numnodes = ipalloc_state->num;
141 *rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
142 if (*rebalance_candidates == NULL) {
143 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
144 return false;
146 *lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
147 if (*lcp2_imbalances == NULL) {
148 DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
149 return false;
152 for (i=0; i<numnodes; i++) {
153 (*lcp2_imbalances)[i] =
154 lcp2_imbalance(ipalloc_state->all_ips, i);
155 /* First step: assume all nodes are candidates */
156 (*rebalance_candidates)[i] = true;
159 /* 2nd step: if a node has IPs assigned then it must have been
160 * healthy before, so we remove it from consideration. This
161 * is overkill but is all we have because we don't maintain
162 * state between takeover runs. An alternative would be to
163 * keep state and invalidate it every time the recovery master
164 * changes.
166 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
167 if (t->pnn != CTDB_UNKNOWN_PNN) {
168 (*rebalance_candidates)[t->pnn] = false;
172 /* 3rd step: if a node is forced to re-balance then
173 we allow failback onto the node */
174 if (ipalloc_state->force_rebalance_nodes == NULL) {
175 return true;
177 for (i = 0;
178 i < talloc_array_length(ipalloc_state->force_rebalance_nodes);
179 i++) {
180 uint32_t pnn = ipalloc_state->force_rebalance_nodes[i];
181 if (pnn >= numnodes) {
182 DEBUG(DEBUG_ERR,
183 (__location__ "unknown node %u\n", pnn));
184 continue;
187 DEBUG(DEBUG_NOTICE,
188 ("Forcing rebalancing of IPs to node %u\n", pnn));
189 (*rebalance_candidates)[pnn] = true;
192 return true;
195 /* Allocate any unassigned addresses using the LCP2 algorithm to find
196 * the IP/node combination that will cost the least.
198 static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
199 uint32_t *lcp2_imbalances)
201 struct public_ip_list *t;
202 unsigned int dstnode, numnodes;
204 unsigned int minnode;
205 uint32_t mindsum, dstdsum, dstimbl;
206 uint32_t minimbl = 0;
207 struct public_ip_list *minip;
209 bool should_loop = true;
210 bool have_unassigned = true;
212 numnodes = ipalloc_state->num;
214 while (have_unassigned && should_loop) {
215 should_loop = false;
217 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
218 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
220 minnode = CTDB_UNKNOWN_PNN;
221 mindsum = 0;
222 minip = NULL;
224 /* loop over each unassigned ip. */
225 for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) {
226 if (t->pnn != CTDB_UNKNOWN_PNN) {
227 continue;
230 for (dstnode = 0; dstnode < numnodes; dstnode++) {
231 /* only check nodes that can actually takeover this ip */
232 if (!can_node_takeover_ip(ipalloc_state,
233 dstnode,
234 t)) {
235 /* no it couldnt so skip to the next node */
236 continue;
239 dstdsum = ip_distance_2_sum(&(t->addr),
240 ipalloc_state->all_ips,
241 dstnode);
242 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
243 DEBUG(DEBUG_DEBUG,
244 (" %s -> %d [+%d]\n",
245 ctdb_sock_addr_to_string(ipalloc_state,
246 &(t->addr),
247 false),
248 dstnode,
249 dstimbl - lcp2_imbalances[dstnode]));
252 if (minnode == CTDB_UNKNOWN_PNN ||
253 dstdsum < mindsum) {
254 minnode = dstnode;
255 minimbl = dstimbl;
256 mindsum = dstdsum;
257 minip = t;
258 should_loop = true;
263 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
265 /* If we found one then assign it to the given node. */
266 if (minnode != CTDB_UNKNOWN_PNN) {
267 minip->pnn = minnode;
268 lcp2_imbalances[minnode] = minimbl;
269 DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
270 ctdb_sock_addr_to_string(
271 ipalloc_state,
272 &(minip->addr), false),
273 minnode,
274 mindsum));
277 /* There might be a better way but at least this is clear. */
278 have_unassigned = false;
279 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
280 if (t->pnn == CTDB_UNKNOWN_PNN) {
281 have_unassigned = true;
286 /* We know if we have an unassigned addresses so we might as
287 * well optimise.
289 if (have_unassigned) {
290 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
291 if (t->pnn == CTDB_UNKNOWN_PNN) {
292 DEBUG(DEBUG_WARNING,
293 ("Failed to find node to cover ip %s\n",
294 ctdb_sock_addr_to_string(ipalloc_state,
295 &t->addr,
296 false)));
302 /* LCP2 algorithm for rebalancing the cluster. Given a candidate node
303 * to move IPs from, determines the best IP/destination node
304 * combination to move from the source node.
306 static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
307 unsigned int srcnode,
308 uint32_t *lcp2_imbalances,
309 bool *rebalance_candidates)
311 unsigned int dstnode, mindstnode, numnodes;
312 uint32_t srcdsum, dstimbl, dstdsum;
313 uint32_t minsrcimbl, mindstimbl;
314 struct public_ip_list *minip;
315 struct public_ip_list *t;
317 /* Find an IP and destination node that best reduces imbalance. */
318 minip = NULL;
319 minsrcimbl = 0;
320 mindstnode = CTDB_UNKNOWN_PNN;
321 mindstimbl = 0;
323 numnodes = ipalloc_state->num;
325 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
326 DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
327 srcnode, lcp2_imbalances[srcnode]));
329 for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
330 uint32_t srcimbl;
332 /* Only consider addresses on srcnode. */
333 if (t->pnn != srcnode) {
334 continue;
337 /* What is this IP address costing the source node? */
338 srcdsum = ip_distance_2_sum(&(t->addr),
339 ipalloc_state->all_ips,
340 srcnode);
341 srcimbl = lcp2_imbalances[srcnode] - srcdsum;
343 /* Consider this IP address would cost each potential
344 * destination node. Destination nodes are limited to
345 * those that are newly healthy, since we don't want
346 * to do gratuitous failover of IPs just to make minor
347 * balance improvements.
349 for (dstnode = 0; dstnode < numnodes; dstnode++) {
350 if (!rebalance_candidates[dstnode]) {
351 continue;
354 /* only check nodes that can actually takeover this ip */
355 if (!can_node_takeover_ip(ipalloc_state, dstnode,
356 t)) {
357 /* no it couldnt so skip to the next node */
358 continue;
361 dstdsum = ip_distance_2_sum(&(t->addr),
362 ipalloc_state->all_ips,
363 dstnode);
364 dstimbl = lcp2_imbalances[dstnode] + dstdsum;
365 DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
366 srcnode, -srcdsum,
367 ctdb_sock_addr_to_string(
368 ipalloc_state,
369 &(t->addr), false),
370 dstnode, dstdsum));
372 if ((dstimbl < lcp2_imbalances[srcnode]) &&
373 (dstdsum < srcdsum) && \
374 ((mindstnode == CTDB_UNKNOWN_PNN) || \
375 ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
377 minip = t;
378 minsrcimbl = srcimbl;
379 mindstnode = dstnode;
380 mindstimbl = dstimbl;
384 DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
386 if (mindstnode != CTDB_UNKNOWN_PNN) {
387 /* We found a move that makes things better... */
388 DEBUG(DEBUG_INFO,
389 ("%d [%d] -> %s -> %d [+%d]\n",
390 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
391 ctdb_sock_addr_to_string(ipalloc_state,
392 &(minip->addr), false),
393 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
396 lcp2_imbalances[srcnode] = minsrcimbl;
397 lcp2_imbalances[mindstnode] = mindstimbl;
398 minip->pnn = mindstnode;
400 return true;
403 return false;
406 struct lcp2_imbalance_pnn {
407 uint32_t imbalance;
408 unsigned int pnn;
411 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
413 const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
414 const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
416 if (lipa->imbalance > lipb->imbalance) {
417 return -1;
418 } else if (lipa->imbalance == lipb->imbalance) {
419 return 0;
420 } else {
421 return 1;
425 /* LCP2 algorithm for rebalancing the cluster. This finds the source
426 * node with the highest LCP2 imbalance, and then determines the best
427 * IP/destination node combination to move from the source node.
429 static void lcp2_failback(struct ipalloc_state *ipalloc_state,
430 uint32_t *lcp2_imbalances,
431 bool *rebalance_candidates)
433 int i, numnodes;
434 struct lcp2_imbalance_pnn * lips;
435 bool again;
437 numnodes = ipalloc_state->num;
439 try_again:
440 /* Put the imbalances and nodes into an array, sort them and
441 * iterate through candidates. Usually the 1st one will be
442 * used, so this doesn't cost much...
444 DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
445 DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
446 lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
447 for (i = 0; i < numnodes; i++) {
448 lips[i].imbalance = lcp2_imbalances[i];
449 lips[i].pnn = i;
450 DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
452 qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
453 lcp2_cmp_imbalance_pnn);
455 again = false;
456 for (i = 0; i < numnodes; i++) {
457 /* This means that all nodes had 0 or 1 addresses, so
458 * can't be imbalanced.
460 if (lips[i].imbalance == 0) {
461 break;
464 if (lcp2_failback_candidate(ipalloc_state,
465 lips[i].pnn,
466 lcp2_imbalances,
467 rebalance_candidates)) {
468 again = true;
469 break;
473 talloc_free(lips);
474 if (again) {
475 goto try_again;
479 bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state)
481 uint32_t *lcp2_imbalances;
482 bool *rebalance_candidates;
483 int numnodes, i;
484 bool have_rebalance_candidates;
485 bool ret = true;
487 unassign_unsuitable_ips(ipalloc_state);
489 if (!lcp2_init(ipalloc_state,
490 &lcp2_imbalances, &rebalance_candidates)) {
491 ret = false;
492 goto finished;
495 lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances);
497 /* If we don't want IPs to fail back then don't rebalance IPs. */
498 if (ipalloc_state->no_ip_failback) {
499 goto finished;
502 /* It is only worth continuing if we have suitable target
503 * nodes to transfer IPs to. This check is much cheaper than
504 * continuing on...
506 numnodes = ipalloc_state->num;
507 have_rebalance_candidates = false;
508 for (i=0; i<numnodes; i++) {
509 if (rebalance_candidates[i]) {
510 have_rebalance_candidates = true;
511 break;
514 if (!have_rebalance_candidates) {
515 goto finished;
518 /* Now, try to make sure the ip adresses are evenly distributed
519 across the nodes.
521 lcp2_failback(ipalloc_state, lcp2_imbalances, rebalance_candidates);
523 finished:
524 return ret;