test/functional/dbcrash.py

   1 #!/usr/bin/env python3
   2 # Copyright (c) 2017 The Bitcoin Core developers
   3 # Distributed under the MIT software license, see the accompanying
   4 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
   5 """Test recovery from a crash during chainstate writing.
   6
   7 - 4 nodes
   8   * node0, node1, and node2 will have different dbcrash ratios, and different
   9     dbcache sizes
  10   * node3 will be a regular node, with no crashing.
  11   * The nodes will not connect to each other.
  12
  13 - use default test framework starting chain. initialize starting_tip_height to
  14   tip height.
  15
  16 - Main loop:
  17   * generate lots of transactions on node3, enough to fill up a block.
  18   * uniformly randomly pick a tip height from starting_tip_height to
  19     tip_height; with probability 1/(height_difference+4), invalidate this block.
  20   * mine enough blocks to overtake tip_height at start of loop.
  21   * for each node in [node0,node1,node2]:
  22      - for each mined block:
  23        * submit block to node
  24        * if node crashed on/after submitting:
  25          - restart until recovery succeeds
  26          - check that utxo matches node3 using gettxoutsetinfo"""
  27
  28 import errno
  29 import http.client
  30 import random
  31 import sys
  32 import time
  33
  34 from test_framework.mininode import *
  35 from test_framework.script import *
  36 from test_framework.test_framework import BitcoinTestFramework
  37 from test_framework.util import *
  38
  39 HTTP_DISCONNECT_ERRORS = [http.client.CannotSendRequest]
  40 try:
  41     HTTP_DISCONNECT_ERRORS.append(http.client.RemoteDisconnected)
  42 except AttributeError:
  43     pass
  44
  45 class ChainstateWriteCrashTest(BitcoinTestFramework):
  46     def set_test_params(self):
  47         self.num_nodes = 4
  48         self.setup_clean_chain = False
  49
  50         # Set -maxmempool=0 to turn off mempool memory sharing with dbcache
  51         # Set -rpcservertimeout=900 to reduce socket disconnects in this
  52         # long-running test
  53         self.base_args = ["-limitdescendantsize=0", "-maxmempool=0", "-rpcservertimeout=900", "-dbbatchsize=200000"]
  54
  55         # Set different crash ratios and cache sizes.  Note that not all of
  56         # -dbcache goes to pcoinsTip.
  57         self.node0_args = ["-dbcrashratio=8", "-dbcache=4"] + self.base_args
  58         self.node1_args = ["-dbcrashratio=16", "-dbcache=8"] + self.base_args
  59         self.node2_args = ["-dbcrashratio=24", "-dbcache=16"] + self.base_args
  60
  61         # Node3 is a normal node with default args, except will mine full blocks
  62         self.node3_args = ["-blockmaxweight=4000000"]
  63         self.extra_args = [self.node0_args, self.node1_args, self.node2_args, self.node3_args]
  64
  65     def setup_network(self):
  66         # Need a bit of extra time for the nodes to start up for this test
  67         self.add_nodes(self.num_nodes, extra_args=self.extra_args, timewait=90)
  68         self.start_nodes()
  69         # Leave them unconnected, we'll use submitblock directly in this test
  70
  71     def restart_node(self, node_index, expected_tip):
  72         """Start up a given node id, wait for the tip to reach the given block hash, and calculate the utxo hash.
  73
  74         Exceptions on startup should indicate node crash (due to -dbcrashratio), in which case we try again. Give up
  75         after 60 seconds. Returns the utxo hash of the given node."""
  76
  77         time_start = time.time()
  78         while time.time() - time_start < 120:
  79             try:
  80                 # Any of these RPC calls could throw due to node crash
  81                 self.start_node(node_index)
  82                 self.nodes[node_index].waitforblock(expected_tip)
  83                 utxo_hash = self.nodes[node_index].gettxoutsetinfo()['hash_serialized_2']
  84                 return utxo_hash
  85             except:
  86                 # An exception here should mean the node is about to crash.
  87                 # If bitcoind exits, then try again.  wait_for_node_exit()
  88                 # should raise an exception if bitcoind doesn't exit.
  89                 self.wait_for_node_exit(node_index, timeout=10)
  90             self.crashed_on_restart += 1
  91             time.sleep(1)
  92
  93         # If we got here, bitcoind isn't coming back up on restart.  Could be a
  94         # bug in bitcoind, or we've gotten unlucky with our dbcrash ratio --
  95         # perhaps we generated a test case that blew up our cache?
  96         # TODO: If this happens a lot, we should try to restart without -dbcrashratio
  97         # and make sure that recovery happens.
  98         raise AssertionError("Unable to successfully restart node %d in allotted time", node_index)
  99
 100     def submit_block_catch_error(self, node_index, block):
 101         """Try submitting a block to the given node.
 102
 103         Catch any exceptions that indicate the node has crashed.
 104         Returns true if the block was submitted successfully; false otherwise."""
 105
 106         try:
 107             self.nodes[node_index].submitblock(block)
 108             return True
 109         except http.client.BadStatusLine as e:
 110             # Prior to 3.5 BadStatusLine('') was raised for a remote disconnect error.
 111             if sys.version_info[0] == 3 and sys.version_info[1] < 5 and e.line == "''":
 112                 self.log.debug("node %d submitblock raised exception: %s", node_index, e)
 113                 return False
 114             else:
 115                 raise
 116         except tuple(HTTP_DISCONNECT_ERRORS) as e:
 117             self.log.debug("node %d submitblock raised exception: %s", node_index, e)
 118             return False
 119         except OSError as e:
 120             self.log.debug("node %d submitblock raised OSError exception: errno=%s", node_index, e.errno)
 121             if e.errno in [errno.EPIPE, errno.ECONNREFUSED, errno.ECONNRESET]:
 122                 # The node has likely crashed
 123                 return False
 124             else:
 125                 # Unexpected exception, raise
 126                 raise
 127
 128     def sync_node3blocks(self, block_hashes):
 129         """Use submitblock to sync node3's chain with the other nodes
 130
 131         If submitblock fails, restart the node and get the new utxo hash.
 132         If any nodes crash while updating, we'll compare utxo hashes to
 133         ensure recovery was successful."""
 134
 135         node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2']
 136
 137         # Retrieve all the blocks from node3
 138         blocks = []
 139         for block_hash in block_hashes:
 140             blocks.append([block_hash, self.nodes[3].getblock(block_hash, 0)])
 141
 142         # Deliver each block to each other node
 143         for i in range(3):
 144             nodei_utxo_hash = None
 145             self.log.debug("Syncing blocks to node %d", i)
 146             for (block_hash, block) in blocks:
 147                 # Get the block from node3, and submit to node_i
 148                 self.log.debug("submitting block %s", block_hash)
 149                 if not self.submit_block_catch_error(i, block):
 150                     # TODO: more carefully check that the crash is due to -dbcrashratio
 151                     # (change the exit code perhaps, and check that here?)
 152                     self.wait_for_node_exit(i, timeout=30)
 153                     self.log.debug("Restarting node %d after block hash %s", i, block_hash)
 154                     nodei_utxo_hash = self.restart_node(i, block_hash)
 155                     assert nodei_utxo_hash is not None
 156                     self.restart_counts[i] += 1
 157                 else:
 158                     # Clear it out after successful submitblock calls -- the cached
 159                     # utxo hash will no longer be correct
 160                     nodei_utxo_hash = None
 161
 162             # Check that the utxo hash matches node3's utxo set
 163             # NOTE: we only check the utxo set if we had to restart the node
 164             # after the last block submitted:
 165             # - checking the utxo hash causes a cache flush, which we don't
 166             # want to do every time; so
 167             # - we only update the utxo cache after a node restart, since flushing
 168             # the cache is a no-op at that point
 169             if nodei_utxo_hash is not None:
 170                 self.log.debug("Checking txoutsetinfo matches for node %d", i)
 171                 assert_equal(nodei_utxo_hash, node3_utxo_hash)
 172
 173     def verify_utxo_hash(self):
 174         """Verify that the utxo hash of each node matches node3.
 175
 176         Restart any nodes that crash while querying."""
 177         node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2']
 178         self.log.info("Verifying utxo hash matches for all nodes")
 179
 180         for i in range(3):
 181             try:
 182                 nodei_utxo_hash = self.nodes[i].gettxoutsetinfo()['hash_serialized_2']
 183             except OSError:
 184                 # probably a crash on db flushing
 185                 nodei_utxo_hash = self.restart_node(i, self.nodes[3].getbestblockhash())
 186             assert_equal(nodei_utxo_hash, node3_utxo_hash)
 187
 188     def generate_small_transactions(self, node, count, utxo_list):
 189         FEE = 1000  # TODO: replace this with node relay fee based calculation
 190         num_transactions = 0
 191         random.shuffle(utxo_list)
 192         while len(utxo_list) >= 2 and num_transactions < count:
 193             tx = CTransaction()
 194             input_amount = 0
 195             for i in range(2):
 196                 utxo = utxo_list.pop()
 197                 tx.vin.append(CTxIn(COutPoint(int(utxo['txid'], 16), utxo['vout'])))
 198                 input_amount += int(utxo['amount'] * COIN)
 199             output_amount = (input_amount - FEE) // 3
 200
 201             if output_amount <= 0:
 202                 # Sanity check -- if we chose inputs that are too small, skip
 203                 continue
 204
 205             for i in range(3):
 206                 tx.vout.append(CTxOut(output_amount, hex_str_to_bytes(utxo['scriptPubKey'])))
 207
 208             # Sign and send the transaction to get into the mempool
 209             tx_signed_hex = node.signrawtransaction(ToHex(tx))['hex']
 210             node.sendrawtransaction(tx_signed_hex)
 211             num_transactions += 1
 212
 213     def run_test(self):
 214         # Track test coverage statistics
 215         self.restart_counts = [0, 0, 0]  # Track the restarts for nodes 0-2
 216         self.crashed_on_restart = 0      # Track count of crashes during recovery
 217
 218         # Start by creating a lot of utxos on node3
 219         initial_height = self.nodes[3].getblockcount()
 220         utxo_list = create_confirmed_utxos(self.nodes[3].getnetworkinfo()['relayfee'], self.nodes[3], 5000)
 221         self.log.info("Prepped %d utxo entries", len(utxo_list))
 222
 223         # Sync these blocks with the other nodes
 224         block_hashes_to_sync = []
 225         for height in range(initial_height + 1, self.nodes[3].getblockcount() + 1):
 226             block_hashes_to_sync.append(self.nodes[3].getblockhash(height))
 227
 228         self.log.debug("Syncing %d blocks with other nodes", len(block_hashes_to_sync))
 229         # Syncing the blocks could cause nodes to crash, so the test begins here.
 230         self.sync_node3blocks(block_hashes_to_sync)
 231
 232         starting_tip_height = self.nodes[3].getblockcount()
 233
 234         # Main test loop:
 235         # each time through the loop, generate a bunch of transactions,
 236         # and then either mine a single new block on the tip, or some-sized reorg.
 237         for i in range(40):
 238             self.log.info("Iteration %d, generating 2500 transactions %s", i, self.restart_counts)
 239             # Generate a bunch of small-ish transactions
 240             self.generate_small_transactions(self.nodes[3], 2500, utxo_list)
 241             # Pick a random block between current tip, and starting tip
 242             current_height = self.nodes[3].getblockcount()
 243             random_height = random.randint(starting_tip_height, current_height)
 244             self.log.debug("At height %d, considering height %d", current_height, random_height)
 245             if random_height > starting_tip_height:
 246                 # Randomly reorg from this point with some probability (1/4 for
 247                 # tip, 1/5 for tip-1, ...)
 248                 if random.random() < 1.0 / (current_height + 4 - random_height):
 249                     self.log.debug("Invalidating block at height %d", random_height)
 250                     self.nodes[3].invalidateblock(self.nodes[3].getblockhash(random_height))
 251
 252             # Now generate new blocks until we pass the old tip height
 253             self.log.debug("Mining longer tip")
 254             block_hashes = []
 255             while current_height + 1 > self.nodes[3].getblockcount():
 256                 block_hashes.extend(self.nodes[3].generate(min(10, current_height + 1 - self.nodes[3].getblockcount())))
 257             self.log.debug("Syncing %d new blocks...", len(block_hashes))
 258             self.sync_node3blocks(block_hashes)
 259             utxo_list = self.nodes[3].listunspent()
 260             self.log.debug("Node3 utxo count: %d", len(utxo_list))
 261
 262         # Check that the utxo hashes agree with node3
 263         # Useful side effect: each utxo cache gets flushed here, so that we
 264         # won't get crashes on shutdown at the end of the test.
 265         self.verify_utxo_hash()
 266
 267         # Check the test coverage
 268         self.log.info("Restarted nodes: %s; crashes on restart: %d", self.restart_counts, self.crashed_on_restart)
 269
 270         # If no nodes were restarted, we didn't test anything.
 271         assert self.restart_counts != [0, 0, 0]
 272
 273         # Make sure we tested the case of crash-during-recovery.
 274         assert self.crashed_on_restart > 0
 275
 276         # Warn if any of the nodes escaped restart.
 277         for i in range(3):
 278             if self.restart_counts[i] == 0:
 279                 self.log.warn("Node %d never crashed during utxo flush!", i)
 280
 281 if __name__ == "__main__":
 282     ChainstateWriteCrashTest().main()