ctdb/tests/simple/77_ctdb_db_recovery.sh

   1 #!/bin/bash
   2
   3 test_info()
   4 {
   5     cat <<EOF
   6 Recovery can under certain circumstances lead to old record copies
   7 resurrecting: Recovery selects the newest record copy purely by RSN. At
   8 the end of the recovery, the recovery master is the dmaster for all
   9 records in all (non-persistent) databases. And the other nodes locally
  10 hold the complete copy of the databases. The bug is that the recovery
  11 process does not increment the RSN on the recovery master at the end of
  12 the recovery. Now clients acting directly on the Recovery master will
  13 directly change a record's content on the recmaster without migration
  14 and hence without RSN bump.  So a subsequent recovery can not tell that
  15 the recmaster's copy is newer than the copies on the other nodes, since
  16 their RSN is the same. Hence, if the recmaster is not node 0 (or more
  17 precisely not the active node with the lowest node number), the recovery
  18 will choose copies from nodes with lower number and stick to these.
  19
  20 Steps:
  21
  22 1. Create a test database
  23 2. Add a record with value value1 on recovery master
  24 3. Force a recovery
  25 4. Update the record with value value2 on recovery master
  26 5. Force a recovery
  27 6. Fetch the record
  28
  29 Expected results:
  30
  31 * The record should have value value2 and not value1
  32
  33 EOF
  34 }
  35
  36 . "${TEST_SCRIPTS_DIR}/integration.bash"
  37
  38 ctdb_test_init "$@"
  39
  40 set -e
  41
  42 cluster_is_healthy
  43
  44 # Reset configuration
  45 ctdb_restart_when_done
  46
  47 #
  48 # Main test
  49 #
  50 TESTDB="rec_test.tdb"
  51
  52 status=0
  53
  54 # Make sure node 0 is not the recovery master
  55 echo "find out which node is recmaster"
  56 try_command_on_node any $CTDB recmaster
  57 recmaster="$out"
  58 if [ "$recmaster" = "0" ]; then
  59     echo "node 0 is recmaster, disable recmasterrole on node 0"
  60     #
  61     # Note:
  62     # It should be sufficient to run "ctdb setrecmasterrole off"
  63     # on node 0 and wait for election and recovery to finish.
  64     # But there were problems related to this in this automatic
  65     # test, so for now use "ctdb stop" and "ctdb continue".
  66     #
  67     echo "stop node 0"
  68     try_command_on_node 0 $CTDB stop
  69     wait_until_node_has_status 0 stopped
  70     echo "continue node 0"
  71     try_command_on_node 0 $CTDB continue
  72     wait_until_node_has_status 0 notstopped
  73
  74     try_command_on_node any $CTDB recmaster
  75     recmaster="$out"
  76     if [ "$recmaster" = "0" ]; then
  77         echo "failed to move recmaster to different node"
  78         exit 1
  79     fi
  80 fi
  81
  82 echo "Recmaster:$recmaster"
  83
  84 # Create a temporary non-persistent database to test with
  85 echo "create test database $TESTDB"
  86 try_command_on_node $recmaster $CTDB attach $TESTDB
  87
  88 # Wipe Test database
  89 echo "wipe test database"
  90 try_command_on_node $recmaster $CTDB wipedb $TESTDB
  91
  92 # Add a record   key=test1 data=value1
  93 echo "store key(test1) data(value1)"
  94 try_command_on_node $recmaster $CTDB writekey $TESTDB test1 value1
  95
  96 # Fetch a record   key=test1
  97 echo "read key(test1)"
  98 try_command_on_node $recmaster $CTDB readkey $TESTDB test1
  99 echo "$out"
 100
 101 # Do a recovery
 102 echo "force recovery"
 103 try_command_on_node $recmaster $CTDB recover
 104
 105 wait_until_node_has_status $recmaster recovered
 106
 107 # Add a record   key=test1 data=value2
 108 echo "store key(test1) data(value2)"
 109 try_command_on_node $recmaster $CTDB writekey $TESTDB test1 value2
 110
 111 # Fetch a record   key=test1
 112 echo "read key(test1)"
 113 try_command_on_node $recmaster $CTDB readkey $TESTDB test1
 114 echo "$out"
 115
 116 # Do a recovery
 117 echo "force recovery"
 118 try_command_on_node $recmaster $CTDB recover
 119
 120 wait_until_node_has_status $recmaster recovered
 121
 122 # Verify record   key=test1
 123 echo "read key(test1)"
 124 try_command_on_node $recmaster $CTDB readkey $TESTDB test1
 125 echo "$out"
 126 if [ "$out" = "Data: size:6 ptr:[value2]" ]; then
 127         echo "GOOD: Recovery did not corrupt database"
 128 else
 129         echo "BAD: Recovery corrupted database"
 130         status=1
 131 fi
 132
 133 exit $status