From f1a20d748f6ab4702be5b17047a3fbfa0f3e8d0c Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Fri, 7 Feb 2014 17:19:20 +1100 Subject: [PATCH] ctdb-recoverd: Fix a bug in the LCP2 rebalancing code srcimbl gets changed on every iteration of the loop. The value that should be stored for the new imbalance of the source node is minsrcimbl. To help diagnose this, add some extra debug that can be left in. The extra debug changes the output of a couple of tests. Note that the resulting IP allocations in those tests are unchanged - only the debug output is changed. Also add some new tests that illustrate the bug. Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs --- ctdb/server/ctdb_takeover.c | 5 +- ctdb/tests/takeover/lcp2.005.sh | 63 ++++-- ctdb/tests/takeover/lcp2.023.sh | 34 ++- ctdb/tests/takeover/lcp2.031.sh | 143 +++++++++++++ ctdb/tests/takeover/lcp2.032.sh | 450 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 673 insertions(+), 22 deletions(-) create mode 100755 ctdb/tests/takeover/lcp2.031.sh create mode 100755 ctdb/tests/takeover/lcp2.032.sh diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c index c21736ed085..d3a6e25aa3a 100644 --- a/ctdb/server/ctdb_takeover.c +++ b/ctdb/server/ctdb_takeover.c @@ -1958,7 +1958,7 @@ static bool lcp2_failback_candidate(struct ctdb_context *ctdb, mindstnode, mindstimbl - lcp2_imbalances[mindstnode])); - lcp2_imbalances[srcnode] = srcimbl; + lcp2_imbalances[srcnode] = minsrcimbl; lcp2_imbalances[mindstnode] = mindstimbl; minip->pnn = mindstnode; @@ -2024,10 +2024,13 @@ try_again: * iterate through candidates. Usually the 1st one will be * used, so this doesn't cost much... 
*/ + DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n")); + DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n")); lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes); for (i=0; i 192.168.21.254 -> 0 [+0] @@ -32,8 +37,13 @@ DATE TIME [PID]: 1 [-121110] -> 192.168.20.249 -> 0 [+0] DATE TIME [PID]: 1 [-121110] -> 192.168.20.249 -> 2 [+0] DATE TIME [PID]: ---------------------------------------- DATE TIME [PID]: 1 [-121363] -> 192.168.20.253 -> 0 [+0] +DATE TIME [PID]: +++++++++++++++++++++++++++++++++++++++++ +DATE TIME [PID]: Selecting most imbalanced node from: +DATE TIME [PID]: 0 [0] +DATE TIME [PID]: 1 [417803] +DATE TIME [PID]: 2 [0] DATE TIME [PID]: ---------------------------------------- -DATE TIME [PID]: CONSIDERING MOVES FROM 1 [418056] +DATE TIME [PID]: CONSIDERING MOVES FROM 1 [417803] DATE TIME [PID]: 1 [-102557] -> 192.168.21.254 -> 0 [+14161] DATE TIME [PID]: 1 [-102557] -> 192.168.21.254 -> 2 [+0] DATE TIME [PID]: 1 [-102810] -> 192.168.21.253 -> 0 [+14161] @@ -52,8 +62,13 @@ DATE TIME [PID]: 1 [-105485] -> 192.168.20.249 -> 0 [+15625] DATE TIME [PID]: 1 [-105485] -> 192.168.20.249 -> 2 [+0] DATE TIME [PID]: ---------------------------------------- DATE TIME [PID]: 1 [-105738] -> 192.168.20.251 -> 2 [+0] +DATE TIME [PID]: +++++++++++++++++++++++++++++++++++++++++ +DATE TIME [PID]: Selecting most imbalanced node from: +DATE TIME [PID]: 0 [0] +DATE TIME [PID]: 1 [312065] +DATE TIME [PID]: 2 [0] DATE TIME [PID]: ---------------------------------------- -DATE TIME [PID]: CONSIDERING MOVES FROM 1 [312571] +DATE TIME [PID]: CONSIDERING MOVES FROM 1 [312065] DATE TIME [PID]: 1 [-88396] -> 192.168.21.254 -> 0 [+14161] DATE TIME [PID]: 1 [-88396] -> 192.168.21.254 -> 2 [+14161] DATE TIME [PID]: 1 [-88649] -> 192.168.21.253 -> 0 [+14161] @@ -70,8 +85,13 @@ DATE TIME [PID]: 1 [-89609] -> 192.168.20.249 -> 0 [+15625] DATE TIME [PID]: 1 [-89609] -> 192.168.20.249 -> 2 [+15876] DATE TIME [PID]: 
---------------------------------------- DATE TIME [PID]: 1 [-88649] -> 192.168.21.253 -> 0 [+14161] +DATE TIME [PID]: +++++++++++++++++++++++++++++++++++++++++ +DATE TIME [PID]: Selecting most imbalanced node from: +DATE TIME [PID]: 0 [14161] +DATE TIME [PID]: 1 [223416] +DATE TIME [PID]: 2 [0] DATE TIME [PID]: ---------------------------------------- -DATE TIME [PID]: CONSIDERING MOVES FROM 1 [222962] +DATE TIME [PID]: CONSIDERING MOVES FROM 1 [223416] DATE TIME [PID]: 1 [-72520] -> 192.168.21.254 -> 0 [+30037] DATE TIME [PID]: 1 [-72520] -> 192.168.21.254 -> 2 [+14161] DATE TIME [PID]: 1 [-72520] -> 192.168.21.252 -> 0 [+30290] @@ -86,8 +106,13 @@ DATE TIME [PID]: 1 [-75448] -> 192.168.20.249 -> 0 [+29786] DATE TIME [PID]: 1 [-75448] -> 192.168.20.249 -> 2 [+15876] DATE TIME [PID]: ---------------------------------------- DATE TIME [PID]: 1 [-75448] -> 192.168.20.254 -> 2 [+15625] +DATE TIME [PID]: +++++++++++++++++++++++++++++++++++++++++ +DATE TIME [PID]: Selecting most imbalanced node from: +DATE TIME [PID]: 0 [14161] +DATE TIME [PID]: 1 [147968] +DATE TIME [PID]: 2 [15625] DATE TIME [PID]: ---------------------------------------- -DATE TIME [PID]: CONSIDERING MOVES FROM 1 [147514] +DATE TIME [PID]: CONSIDERING MOVES FROM 1 [147968] DATE TIME [PID]: 1 [-58359] -> 192.168.21.254 -> 0 [+30037] DATE TIME [PID]: 1 [-58359] -> 192.168.21.254 -> 2 [+28322] DATE TIME [PID]: 1 [-58359] -> 192.168.21.252 -> 0 [+30290] @@ -100,8 +125,13 @@ DATE TIME [PID]: 1 [-59823] -> 192.168.20.249 -> 0 [+29786] DATE TIME [PID]: 1 [-59823] -> 192.168.20.249 -> 2 [+31501] DATE TIME [PID]: ---------------------------------------- DATE TIME [PID]: 1 [-59823] -> 192.168.20.250 -> 0 [+29786] +DATE TIME [PID]: +++++++++++++++++++++++++++++++++++++++++ +DATE TIME [PID]: Selecting most imbalanced node from: +DATE TIME [PID]: 0 [43947] +DATE TIME [PID]: 1 [88145] +DATE TIME [PID]: 2 [15625] DATE TIME [PID]: ---------------------------------------- -DATE TIME [PID]: CONSIDERING MOVES FROM 1 
[87691] +DATE TIME [PID]: CONSIDERING MOVES FROM 1 [88145] DATE TIME [PID]: 1 [-44198] -> 192.168.21.254 -> 0 [+44198] DATE TIME [PID]: 1 [-44198] -> 192.168.21.254 -> 2 [+28322] DATE TIME [PID]: 1 [-44198] -> 192.168.21.252 -> 0 [+44451] @@ -112,6 +142,11 @@ DATE TIME [PID]: 1 [-43947] -> 192.168.20.249 -> 0 [+45662] DATE TIME [PID]: 1 [-43947] -> 192.168.20.249 -> 2 [+31501] DATE TIME [PID]: ---------------------------------------- DATE TIME [PID]: 1 [-44198] -> 192.168.21.254 -> 2 [+28322] +DATE TIME [PID]: +++++++++++++++++++++++++++++++++++++++++ +DATE TIME [PID]: Selecting most imbalanced node from: +DATE TIME [PID]: 0 [43947] +DATE TIME [PID]: 1 [43947] +DATE TIME [PID]: 2 [43947] DATE TIME [PID]: ---------------------------------------- DATE TIME [PID]: CONSIDERING MOVES FROM 0 [43947] DATE TIME [PID]: 0 [-28322] -> 192.168.21.253 -> 0 [+28322] @@ -122,6 +157,15 @@ DATE TIME [PID]: 0 [-29786] -> 192.168.20.250 -> 0 [+29786] DATE TIME [PID]: 0 [-29786] -> 192.168.20.250 -> 2 [+45915] DATE TIME [PID]: ---------------------------------------- DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: CONSIDERING MOVES FROM 1 [43947] +DATE TIME [PID]: 1 [-28322] -> 192.168.21.252 -> 0 [+44451] +DATE TIME [PID]: 1 [-28322] -> 192.168.21.252 -> 2 [+44198] +DATE TIME [PID]: 1 [-29786] -> 192.168.20.252 -> 0 [+45915] +DATE TIME [PID]: 1 [-29786] -> 192.168.20.252 -> 2 [+45662] +DATE TIME [PID]: 1 [-29786] -> 192.168.20.249 -> 0 [+45662] +DATE TIME [PID]: 1 [-29786] -> 192.168.20.249 -> 2 [+45662] +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: ---------------------------------------- DATE TIME [PID]: CONSIDERING MOVES FROM 2 [43947] DATE TIME [PID]: 2 [-28322] -> 192.168.21.254 -> 0 [+44198] DATE TIME [PID]: 2 [-28322] -> 192.168.21.254 -> 2 [+28322] @@ -130,15 +174,6 @@ DATE TIME [PID]: 2 [-29786] -> 192.168.20.254 -> 2 [+29786] DATE TIME [PID]: 2 [-29786] -> 192.168.20.251 -> 0 [+45915] DATE TIME [PID]: 2 [-29786] -> 
192.168.20.251 -> 2 [+29786] DATE TIME [PID]: ---------------------------------------- -DATE TIME [PID]: ---------------------------------------- -DATE TIME [PID]: CONSIDERING MOVES FROM 1 [43744] -DATE TIME [PID]: 1 [-28322] -> 192.168.21.252 -> 0 [+44451] -DATE TIME [PID]: 1 [-28322] -> 192.168.21.252 -> 2 [+44198] -DATE TIME [PID]: 1 [-29786] -> 192.168.20.252 -> 0 [+45915] -DATE TIME [PID]: 1 [-29786] -> 192.168.20.252 -> 2 [+45662] -DATE TIME [PID]: 1 [-29786] -> 192.168.20.249 -> 0 [+45662] -DATE TIME [PID]: 1 [-29786] -> 192.168.20.249 -> 2 [+45662] -DATE TIME [PID]: ---------------------------------------- 192.168.21.254 2 192.168.21.253 0 192.168.21.252 1 diff --git a/ctdb/tests/takeover/lcp2.023.sh b/ctdb/tests/takeover/lcp2.023.sh index 9bffc58c6ba..0f35b3ea88d 100755 --- a/ctdb/tests/takeover/lcp2.023.sh +++ b/ctdb/tests/takeover/lcp2.023.sh @@ -10,6 +10,11 @@ required_result < 192.168.21.254 -> 1 [+0] @@ -19,6 +24,11 @@ DATE TIME [PID]: 2 [-59823] -> 192.168.20.251 -> 1 [+0] DATE TIME [PID]: 2 [-59823] -> 192.168.20.249 -> 1 [+0] DATE TIME [PID]: ---------------------------------------- DATE TIME [PID]: 2 [-59823] -> 192.168.20.251 -> 1 [+0] +DATE TIME [PID]: +++++++++++++++++++++++++++++++++++++++++ +DATE TIME [PID]: Selecting most imbalanced node from: +DATE TIME [PID]: 0 [89609] +DATE TIME [PID]: 1 [0] +DATE TIME [PID]: 2 [88145] DATE TIME [PID]: ---------------------------------------- DATE TIME [PID]: CONSIDERING MOVES FROM 0 [89609] DATE TIME [PID]: 0 [-42483] -> 192.168.21.253 -> 1 [+14161] @@ -27,6 +37,11 @@ DATE TIME [PID]: 0 [-45662] -> 192.168.20.252 -> 1 [+15625] DATE TIME [PID]: 0 [-45411] -> 192.168.20.250 -> 1 [+16129] DATE TIME [PID]: ---------------------------------------- DATE TIME [PID]: 0 [-45662] -> 192.168.20.254 -> 1 [+15625] +DATE TIME [PID]: +++++++++++++++++++++++++++++++++++++++++ +DATE TIME [PID]: Selecting most imbalanced node from: +DATE TIME [PID]: 0 [43947] +DATE TIME [PID]: 1 [15625] +DATE TIME [PID]: 2 [88145] DATE 
TIME [PID]: ---------------------------------------- DATE TIME [PID]: CONSIDERING MOVES FROM 2 [88145] DATE TIME [PID]: 2 [-44198] -> 192.168.21.254 -> 1 [+28322] @@ -35,24 +50,29 @@ DATE TIME [PID]: 2 [-43947] -> 192.168.20.253 -> 1 [+31501] DATE TIME [PID]: 2 [-43947] -> 192.168.20.249 -> 1 [+31501] DATE TIME [PID]: ---------------------------------------- DATE TIME [PID]: 2 [-44198] -> 192.168.21.254 -> 1 [+28322] +DATE TIME [PID]: +++++++++++++++++++++++++++++++++++++++++ +DATE TIME [PID]: Selecting most imbalanced node from: +DATE TIME [PID]: 0 [43947] +DATE TIME [PID]: 1 [43947] +DATE TIME [PID]: 2 [43947] DATE TIME [PID]: ---------------------------------------- -DATE TIME [PID]: CONSIDERING MOVES FROM 0 [44198] +DATE TIME [PID]: CONSIDERING MOVES FROM 0 [43947] DATE TIME [PID]: 0 [-28322] -> 192.168.21.253 -> 1 [+44198] DATE TIME [PID]: 0 [-29786] -> 192.168.20.252 -> 1 [+45662] DATE TIME [PID]: 0 [-29786] -> 192.168.20.250 -> 1 [+45915] DATE TIME [PID]: ---------------------------------------- DATE TIME [PID]: ---------------------------------------- -DATE TIME [PID]: CONSIDERING MOVES FROM 2 [44198] -DATE TIME [PID]: 2 [-28322] -> 192.168.21.252 -> 1 [+44198] -DATE TIME [PID]: 2 [-29786] -> 192.168.20.253 -> 1 [+45662] -DATE TIME [PID]: 2 [-29786] -> 192.168.20.249 -> 1 [+45662] -DATE TIME [PID]: ---------------------------------------- -DATE TIME [PID]: ---------------------------------------- DATE TIME [PID]: CONSIDERING MOVES FROM 1 [43947] DATE TIME [PID]: 1 [-28322] -> 192.168.21.254 -> 1 [+28322] DATE TIME [PID]: 1 [-29786] -> 192.168.20.254 -> 1 [+29786] DATE TIME [PID]: 1 [-29786] -> 192.168.20.251 -> 1 [+29786] DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: CONSIDERING MOVES FROM 2 [43947] +DATE TIME [PID]: 2 [-28322] -> 192.168.21.252 -> 1 [+44198] +DATE TIME [PID]: 2 [-29786] -> 192.168.20.253 -> 1 [+45662] +DATE TIME [PID]: 2 [-29786] -> 192.168.20.249 -> 1 
[+45662] +DATE TIME [PID]: ---------------------------------------- 192.168.21.254 1 192.168.21.253 0 192.168.21.252 2 diff --git a/ctdb/tests/takeover/lcp2.031.sh b/ctdb/tests/takeover/lcp2.031.sh new file mode 100755 index 00000000000..6c5acf91ffa --- /dev/null +++ b/ctdb/tests/takeover/lcp2.031.sh @@ -0,0 +1,143 @@ +#!/bin/sh + +. "${TEST_SCRIPTS_DIR}/unit.sh" + +define_test "12+4 IPs, 4 nodes, 3 -> 4 healthy" + +export CTDB_TEST_LOGLEVEL=4 + +required_result < 130.216.30.178 -> 0 [+0] +DATE TIME [PID]: 1 [-64566] -> 130.216.30.176 -> 0 [+0] +DATE TIME [PID]: 1 [-64315] -> 130.216.30.175 -> 0 [+0] +DATE TIME [PID]: 1 [-64315] -> 130.216.30.171 -> 0 [+0] +DATE TIME [PID]: 1 [-52489] -> 10.19.99.253 -> 0 [+0] +DATE TIME [PID]: 1 [-52489] -> 10.19.99.250 -> 0 [+0] +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: 1 [-64566] -> 130.216.30.178 -> 0 [+0] +DATE TIME [PID]: +++++++++++++++++++++++++++++++++++++++++ +DATE TIME [PID]: Selecting most imbalanced node from: +DATE TIME [PID]: 0 [0] +DATE TIME [PID]: 1 [116804] +DATE TIME [PID]: 2 [128630] +DATE TIME [PID]: 3 [128881] +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: CONSIDERING MOVES FROM 3 [128881] +DATE TIME [PID]: 3 [-55099] -> 130.216.30.180 -> 0 [+15625] +DATE TIME [PID]: 3 [-55099] -> 130.216.30.177 -> 0 [+15876] +DATE TIME [PID]: 3 [-55350] -> 130.216.30.174 -> 0 [+15129] +DATE TIME [PID]: 3 [-55350] -> 130.216.30.173 -> 0 [+15129] +DATE TIME [PID]: 3 [-36864] -> 10.19.99.252 -> 0 [+9216] +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: 3 [-55350] -> 130.216.30.174 -> 0 [+15129] +DATE TIME [PID]: +++++++++++++++++++++++++++++++++++++++++ +DATE TIME [PID]: Selecting most imbalanced node from: +DATE TIME [PID]: 0 [15129] +DATE TIME [PID]: 1 [116804] +DATE TIME [PID]: 2 [128630] +DATE TIME [PID]: 3 [73531] +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: CONSIDERING MOVES FROM 2 [128630] +DATE 
TIME [PID]: 2 [-55099] -> 130.216.30.181 -> 0 [+30754] +DATE TIME [PID]: 2 [-55099] -> 130.216.30.179 -> 0 [+31258] +DATE TIME [PID]: 2 [-55099] -> 130.216.30.172 -> 0 [+31005] +DATE TIME [PID]: 2 [-55099] -> 130.216.30.170 -> 0 [+30754] +DATE TIME [PID]: 2 [-36864] -> 10.19.99.251 -> 0 [+18432] +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: 2 [-55099] -> 130.216.30.181 -> 0 [+30754] +DATE TIME [PID]: +++++++++++++++++++++++++++++++++++++++++ +DATE TIME [PID]: Selecting most imbalanced node from: +DATE TIME [PID]: 0 [45883] +DATE TIME [PID]: 1 [116804] +DATE TIME [PID]: 2 [73531] +DATE TIME [PID]: 3 [73531] +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: CONSIDERING MOVES FROM 1 [116804] +DATE TIME [PID]: 1 [-48690] -> 130.216.30.176 -> 0 [+46630] +DATE TIME [PID]: 1 [-49186] -> 130.216.30.175 -> 0 [+46387] +DATE TIME [PID]: 1 [-49186] -> 130.216.30.171 -> 0 [+45883] +DATE TIME [PID]: 1 [-43273] -> 10.19.99.253 -> 0 [+27648] +DATE TIME [PID]: 1 [-43273] -> 10.19.99.250 -> 0 [+27648] +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: 1 [-43273] -> 10.19.99.253 -> 0 [+27648] +DATE TIME [PID]: +++++++++++++++++++++++++++++++++++++++++ +DATE TIME [PID]: Selecting most imbalanced node from: +DATE TIME [PID]: 0 [73531] +DATE TIME [PID]: 1 [73531] +DATE TIME [PID]: 2 [73531] +DATE TIME [PID]: 3 [73531] +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: CONSIDERING MOVES FROM 0 [73531] +DATE TIME [PID]: 0 [-39970] -> 130.216.30.181 -> 0 [+39970] +DATE TIME [PID]: 0 [-39970] -> 130.216.30.178 -> 0 [+39970] +DATE TIME [PID]: 0 [-39474] -> 130.216.30.174 -> 0 [+39474] +DATE TIME [PID]: 0 [-27648] -> 10.19.99.253 -> 0 [+27648] +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: CONSIDERING MOVES FROM 1 [73531] +DATE TIME [PID]: 1 [-39474] -> 130.216.30.176 -> 0 [+55846] +DATE TIME 
[PID]: 1 [-39970] -> 130.216.30.175 -> 0 [+55603] +DATE TIME [PID]: 1 [-39970] -> 130.216.30.171 -> 0 [+55099] +DATE TIME [PID]: 1 [-27648] -> 10.19.99.250 -> 0 [+43273] +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: CONSIDERING MOVES FROM 2 [73531] +DATE TIME [PID]: 2 [-39474] -> 130.216.30.179 -> 0 [+56099] +DATE TIME [PID]: 2 [-39970] -> 130.216.30.172 -> 0 [+55350] +DATE TIME [PID]: 2 [-39970] -> 130.216.30.170 -> 0 [+55099] +DATE TIME [PID]: 2 [-27648] -> 10.19.99.251 -> 0 [+43273] +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: ---------------------------------------- +DATE TIME [PID]: CONSIDERING MOVES FROM 3 [73531] +DATE TIME [PID]: 3 [-39970] -> 130.216.30.180 -> 0 [+56099] +DATE TIME [PID]: 3 [-39970] -> 130.216.30.177 -> 0 [+55846] +DATE TIME [PID]: 3 [-39474] -> 130.216.30.173 -> 0 [+55350] +DATE TIME [PID]: 3 [-27648] -> 10.19.99.252 -> 0 [+43777] +DATE TIME [PID]: ---------------------------------------- +130.216.30.181 0 +130.216.30.180 3 +130.216.30.179 2 +130.216.30.178 0 +130.216.30.177 3 +130.216.30.176 1 +130.216.30.175 1 +130.216.30.174 0 +130.216.30.173 3 +130.216.30.172 2 +130.216.30.171 1 +130.216.30.170 2 +10.19.99.253 0 +10.19.99.252 3 +10.19.99.251 2 +10.19.99.250 1 +EOF + +simple_test 0,0,0,0 < continue node 3, all healthy" + +required_result < stop node 0" + +required_result < stop node 1" + +required_result < Stop node 2" + +required_result < stop node 3" + +required_result < node 0 stopped" + +required_result < node 1 stopped" + +required_result < node 2 stopped" + +required_result < node 3 stopped" + +required_result <