Merge #11726: Cleanups + nit fixes for walletdir PR
[bitcoinplatinum.git] / test / functional / dbcrash.py
blob24b9765b4e3773ab8830f7950da1adbb15fad601
1 #!/usr/bin/env python3
2 # Copyright (c) 2017 The Bitcoin Core developers
3 # Distributed under the MIT software license, see the accompanying
4 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
5 """Test recovery from a crash during chainstate writing.
7 - 4 nodes
8 * node0, node1, and node2 will have different dbcrash ratios, and different
9 dbcache sizes
10 * node3 will be a regular node, with no crashing.
11 * The nodes will not connect to each other.
13 - use default test framework starting chain. initialize starting_tip_height to
14 tip height.
16 - Main loop:
17 * generate lots of transactions on node3, enough to fill up a block.
18 * uniformly randomly pick a tip height from starting_tip_height to
19 tip_height; with probability 1/(height_difference+4), invalidate this block.
20 * mine enough blocks to overtake tip_height at start of loop.
21 * for each node in [node0,node1,node2]:
22 - for each mined block:
23 * submit block to node
24 * if node crashed on/after submitting:
25 - restart until recovery succeeds
26 - check that utxo matches node3 using gettxoutsetinfo"""
28 import errno
29 import http.client
30 import random
31 import sys
32 import time
34 from test_framework.mininode import *
35 from test_framework.script import *
36 from test_framework.test_framework import BitcoinTestFramework
37 from test_framework.util import *
39 HTTP_DISCONNECT_ERRORS = [http.client.CannotSendRequest]
40 try:
41 HTTP_DISCONNECT_ERRORS.append(http.client.RemoteDisconnected)
42 except AttributeError:
43 pass
45 class ChainstateWriteCrashTest(BitcoinTestFramework):
46 def set_test_params(self):
47 self.num_nodes = 4
48 self.setup_clean_chain = False
50 # Set -maxmempool=0 to turn off mempool memory sharing with dbcache
51 # Set -rpcservertimeout=900 to reduce socket disconnects in this
52 # long-running test
53 self.base_args = ["-limitdescendantsize=0", "-maxmempool=0", "-rpcservertimeout=900", "-dbbatchsize=200000"]
55 # Set different crash ratios and cache sizes. Note that not all of
56 # -dbcache goes to pcoinsTip.
57 self.node0_args = ["-dbcrashratio=8", "-dbcache=4"] + self.base_args
58 self.node1_args = ["-dbcrashratio=16", "-dbcache=8"] + self.base_args
59 self.node2_args = ["-dbcrashratio=24", "-dbcache=16"] + self.base_args
61 # Node3 is a normal node with default args, except will mine full blocks
62 self.node3_args = ["-blockmaxweight=4000000"]
63 self.extra_args = [self.node0_args, self.node1_args, self.node2_args, self.node3_args]
65 def setup_network(self):
66 # Need a bit of extra time for the nodes to start up for this test
67 self.add_nodes(self.num_nodes, extra_args=self.extra_args, timewait=90)
68 self.start_nodes()
69 # Leave them unconnected, we'll use submitblock directly in this test
71 def restart_node(self, node_index, expected_tip):
72 """Start up a given node id, wait for the tip to reach the given block hash, and calculate the utxo hash.
74 Exceptions on startup should indicate node crash (due to -dbcrashratio), in which case we try again. Give up
75 after 60 seconds. Returns the utxo hash of the given node."""
77 time_start = time.time()
78 while time.time() - time_start < 120:
79 try:
80 # Any of these RPC calls could throw due to node crash
81 self.start_node(node_index)
82 self.nodes[node_index].waitforblock(expected_tip)
83 utxo_hash = self.nodes[node_index].gettxoutsetinfo()['hash_serialized_2']
84 return utxo_hash
85 except:
86 # An exception here should mean the node is about to crash.
87 # If bitcoind exits, then try again. wait_for_node_exit()
88 # should raise an exception if bitcoind doesn't exit.
89 self.wait_for_node_exit(node_index, timeout=10)
90 self.crashed_on_restart += 1
91 time.sleep(1)
93 # If we got here, bitcoind isn't coming back up on restart. Could be a
94 # bug in bitcoind, or we've gotten unlucky with our dbcrash ratio --
95 # perhaps we generated a test case that blew up our cache?
96 # TODO: If this happens a lot, we should try to restart without -dbcrashratio
97 # and make sure that recovery happens.
98 raise AssertionError("Unable to successfully restart node %d in allotted time", node_index)
100 def submit_block_catch_error(self, node_index, block):
101 """Try submitting a block to the given node.
103 Catch any exceptions that indicate the node has crashed.
104 Returns true if the block was submitted successfully; false otherwise."""
106 try:
107 self.nodes[node_index].submitblock(block)
108 return True
109 except http.client.BadStatusLine as e:
110 # Prior to 3.5 BadStatusLine('') was raised for a remote disconnect error.
111 if sys.version_info[0] == 3 and sys.version_info[1] < 5 and e.line == "''":
112 self.log.debug("node %d submitblock raised exception: %s", node_index, e)
113 return False
114 else:
115 raise
116 except tuple(HTTP_DISCONNECT_ERRORS) as e:
117 self.log.debug("node %d submitblock raised exception: %s", node_index, e)
118 return False
119 except OSError as e:
120 self.log.debug("node %d submitblock raised OSError exception: errno=%s", node_index, e.errno)
121 if e.errno in [errno.EPIPE, errno.ECONNREFUSED, errno.ECONNRESET]:
122 # The node has likely crashed
123 return False
124 else:
125 # Unexpected exception, raise
126 raise
128 def sync_node3blocks(self, block_hashes):
129 """Use submitblock to sync node3's chain with the other nodes
131 If submitblock fails, restart the node and get the new utxo hash.
132 If any nodes crash while updating, we'll compare utxo hashes to
133 ensure recovery was successful."""
135 node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2']
137 # Retrieve all the blocks from node3
138 blocks = []
139 for block_hash in block_hashes:
140 blocks.append([block_hash, self.nodes[3].getblock(block_hash, 0)])
142 # Deliver each block to each other node
143 for i in range(3):
144 nodei_utxo_hash = None
145 self.log.debug("Syncing blocks to node %d", i)
146 for (block_hash, block) in blocks:
147 # Get the block from node3, and submit to node_i
148 self.log.debug("submitting block %s", block_hash)
149 if not self.submit_block_catch_error(i, block):
150 # TODO: more carefully check that the crash is due to -dbcrashratio
151 # (change the exit code perhaps, and check that here?)
152 self.wait_for_node_exit(i, timeout=30)
153 self.log.debug("Restarting node %d after block hash %s", i, block_hash)
154 nodei_utxo_hash = self.restart_node(i, block_hash)
155 assert nodei_utxo_hash is not None
156 self.restart_counts[i] += 1
157 else:
158 # Clear it out after successful submitblock calls -- the cached
159 # utxo hash will no longer be correct
160 nodei_utxo_hash = None
162 # Check that the utxo hash matches node3's utxo set
163 # NOTE: we only check the utxo set if we had to restart the node
164 # after the last block submitted:
165 # - checking the utxo hash causes a cache flush, which we don't
166 # want to do every time; so
167 # - we only update the utxo cache after a node restart, since flushing
168 # the cache is a no-op at that point
169 if nodei_utxo_hash is not None:
170 self.log.debug("Checking txoutsetinfo matches for node %d", i)
171 assert_equal(nodei_utxo_hash, node3_utxo_hash)
173 def verify_utxo_hash(self):
174 """Verify that the utxo hash of each node matches node3.
176 Restart any nodes that crash while querying."""
177 node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2']
178 self.log.info("Verifying utxo hash matches for all nodes")
180 for i in range(3):
181 try:
182 nodei_utxo_hash = self.nodes[i].gettxoutsetinfo()['hash_serialized_2']
183 except OSError:
184 # probably a crash on db flushing
185 nodei_utxo_hash = self.restart_node(i, self.nodes[3].getbestblockhash())
186 assert_equal(nodei_utxo_hash, node3_utxo_hash)
188 def generate_small_transactions(self, node, count, utxo_list):
189 FEE = 1000 # TODO: replace this with node relay fee based calculation
190 num_transactions = 0
191 random.shuffle(utxo_list)
192 while len(utxo_list) >= 2 and num_transactions < count:
193 tx = CTransaction()
194 input_amount = 0
195 for i in range(2):
196 utxo = utxo_list.pop()
197 tx.vin.append(CTxIn(COutPoint(int(utxo['txid'], 16), utxo['vout'])))
198 input_amount += int(utxo['amount'] * COIN)
199 output_amount = (input_amount - FEE) // 3
201 if output_amount <= 0:
202 # Sanity check -- if we chose inputs that are too small, skip
203 continue
205 for i in range(3):
206 tx.vout.append(CTxOut(output_amount, hex_str_to_bytes(utxo['scriptPubKey'])))
208 # Sign and send the transaction to get into the mempool
209 tx_signed_hex = node.signrawtransaction(ToHex(tx))['hex']
210 node.sendrawtransaction(tx_signed_hex)
211 num_transactions += 1
213 def run_test(self):
214 # Track test coverage statistics
215 self.restart_counts = [0, 0, 0] # Track the restarts for nodes 0-2
216 self.crashed_on_restart = 0 # Track count of crashes during recovery
218 # Start by creating a lot of utxos on node3
219 initial_height = self.nodes[3].getblockcount()
220 utxo_list = create_confirmed_utxos(self.nodes[3].getnetworkinfo()['relayfee'], self.nodes[3], 5000)
221 self.log.info("Prepped %d utxo entries", len(utxo_list))
223 # Sync these blocks with the other nodes
224 block_hashes_to_sync = []
225 for height in range(initial_height + 1, self.nodes[3].getblockcount() + 1):
226 block_hashes_to_sync.append(self.nodes[3].getblockhash(height))
228 self.log.debug("Syncing %d blocks with other nodes", len(block_hashes_to_sync))
229 # Syncing the blocks could cause nodes to crash, so the test begins here.
230 self.sync_node3blocks(block_hashes_to_sync)
232 starting_tip_height = self.nodes[3].getblockcount()
234 # Main test loop:
235 # each time through the loop, generate a bunch of transactions,
236 # and then either mine a single new block on the tip, or some-sized reorg.
237 for i in range(40):
238 self.log.info("Iteration %d, generating 2500 transactions %s", i, self.restart_counts)
239 # Generate a bunch of small-ish transactions
240 self.generate_small_transactions(self.nodes[3], 2500, utxo_list)
241 # Pick a random block between current tip, and starting tip
242 current_height = self.nodes[3].getblockcount()
243 random_height = random.randint(starting_tip_height, current_height)
244 self.log.debug("At height %d, considering height %d", current_height, random_height)
245 if random_height > starting_tip_height:
246 # Randomly reorg from this point with some probability (1/4 for
247 # tip, 1/5 for tip-1, ...)
248 if random.random() < 1.0 / (current_height + 4 - random_height):
249 self.log.debug("Invalidating block at height %d", random_height)
250 self.nodes[3].invalidateblock(self.nodes[3].getblockhash(random_height))
252 # Now generate new blocks until we pass the old tip height
253 self.log.debug("Mining longer tip")
254 block_hashes = []
255 while current_height + 1 > self.nodes[3].getblockcount():
256 block_hashes.extend(self.nodes[3].generate(min(10, current_height + 1 - self.nodes[3].getblockcount())))
257 self.log.debug("Syncing %d new blocks...", len(block_hashes))
258 self.sync_node3blocks(block_hashes)
259 utxo_list = self.nodes[3].listunspent()
260 self.log.debug("Node3 utxo count: %d", len(utxo_list))
262 # Check that the utxo hashes agree with node3
263 # Useful side effect: each utxo cache gets flushed here, so that we
264 # won't get crashes on shutdown at the end of the test.
265 self.verify_utxo_hash()
267 # Check the test coverage
268 self.log.info("Restarted nodes: %s; crashes on restart: %d", self.restart_counts, self.crashed_on_restart)
270 # If no nodes were restarted, we didn't test anything.
271 assert self.restart_counts != [0, 0, 0]
273 # Make sure we tested the case of crash-during-recovery.
274 assert self.crashed_on_restart > 0
276 # Warn if any of the nodes escaped restart.
277 for i in range(3):
278 if self.restart_counts[i] == 0:
279 self.log.warn("Node %d never crashed during utxo flush!", i)
281 if __name__ == "__main__":
282 ChainstateWriteCrashTest().main()