2 # Copyright (c) 2017 The Bitcoin Core developers
3 # Distributed under the MIT software license, see the accompanying
4 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
5 """Test recovery from a crash during chainstate writing.
8 * node0, node1, and node2 will have different dbcrash ratios, and different
10 * node3 will be a regular node, with no crashing.
11 * The nodes will not connect to each other.
13 - use default test framework starting chain. initialize starting_tip_height to
17 * generate lots of transactions on node3, enough to fill up a block.
18 * uniformly randomly pick a tip height from starting_tip_height to
19 tip_height; with probability 1/(height_difference+4), invalidate this block.
20 * mine enough blocks to overtake tip_height at start of loop.
21 * for each node in [node0,node1,node2]:
22 - for each mined block:
23 * submit block to node
24 * if node crashed on/after submitting:
25 - restart until recovery succeeds
26 - check that utxo matches node3 using gettxoutsetinfo"""
34 from test_framework
.mininode
import *
35 from test_framework
.script
import *
36 from test_framework
.test_framework
import BitcoinTestFramework
37 from test_framework
.util
import *
39 HTTP_DISCONNECT_ERRORS
= [http
.client
.CannotSendRequest
]
41 HTTP_DISCONNECT_ERRORS
.append(http
.client
.RemoteDisconnected
)
42 except AttributeError:
45 class ChainstateWriteCrashTest(BitcoinTestFramework
):
46 def set_test_params(self
):
48 self
.setup_clean_chain
= False
50 # Set -maxmempool=0 to turn off mempool memory sharing with dbcache
51 # Set -rpcservertimeout=900 to reduce socket disconnects in this
53 self
.base_args
= ["-limitdescendantsize=0", "-maxmempool=0", "-rpcservertimeout=900", "-dbbatchsize=200000"]
55 # Set different crash ratios and cache sizes. Note that not all of
56 # -dbcache goes to pcoinsTip.
57 self
.node0_args
= ["-dbcrashratio=8", "-dbcache=4"] + self
.base_args
58 self
.node1_args
= ["-dbcrashratio=16", "-dbcache=8"] + self
.base_args
59 self
.node2_args
= ["-dbcrashratio=24", "-dbcache=16"] + self
.base_args
61 # Node3 is a normal node with default args, except will mine full blocks
62 self
.node3_args
= ["-blockmaxweight=4000000"]
63 self
.extra_args
= [self
.node0_args
, self
.node1_args
, self
.node2_args
, self
.node3_args
]
65 def setup_network(self
):
66 # Need a bit of extra time for the nodes to start up for this test
67 self
.add_nodes(self
.num_nodes
, extra_args
=self
.extra_args
, timewait
=90)
69 # Leave them unconnected, we'll use submitblock directly in this test
71 def restart_node(self
, node_index
, expected_tip
):
72 """Start up a given node id, wait for the tip to reach the given block hash, and calculate the utxo hash.
74 Exceptions on startup should indicate node crash (due to -dbcrashratio), in which case we try again. Give up
75 after 60 seconds. Returns the utxo hash of the given node."""
77 time_start
= time
.time()
78 while time
.time() - time_start
< 120:
80 # Any of these RPC calls could throw due to node crash
81 self
.start_node(node_index
)
82 self
.nodes
[node_index
].waitforblock(expected_tip
)
83 utxo_hash
= self
.nodes
[node_index
].gettxoutsetinfo()['hash_serialized_2']
86 # An exception here should mean the node is about to crash.
87 # If bitcoind exits, then try again. wait_for_node_exit()
88 # should raise an exception if bitcoind doesn't exit.
89 self
.wait_for_node_exit(node_index
, timeout
=10)
90 self
.crashed_on_restart
+= 1
93 # If we got here, bitcoind isn't coming back up on restart. Could be a
94 # bug in bitcoind, or we've gotten unlucky with our dbcrash ratio --
95 # perhaps we generated a test case that blew up our cache?
96 # TODO: If this happens a lot, we should try to restart without -dbcrashratio
97 # and make sure that recovery happens.
98 raise AssertionError("Unable to successfully restart node %d in allotted time", node_index
)
100 def submit_block_catch_error(self
, node_index
, block
):
101 """Try submitting a block to the given node.
103 Catch any exceptions that indicate the node has crashed.
104 Returns true if the block was submitted successfully; false otherwise."""
107 self
.nodes
[node_index
].submitblock(block
)
109 except http
.client
.BadStatusLine
as e
:
110 # Prior to 3.5 BadStatusLine('') was raised for a remote disconnect error.
111 if sys
.version_info
[0] == 3 and sys
.version_info
[1] < 5 and e
.line
== "''":
112 self
.log
.debug("node %d submitblock raised exception: %s", node_index
, e
)
116 except tuple(HTTP_DISCONNECT_ERRORS
) as e
:
117 self
.log
.debug("node %d submitblock raised exception: %s", node_index
, e
)
120 self
.log
.debug("node %d submitblock raised OSError exception: errno=%s", node_index
, e
.errno
)
121 if e
.errno
in [errno
.EPIPE
, errno
.ECONNREFUSED
, errno
.ECONNRESET
]:
122 # The node has likely crashed
125 # Unexpected exception, raise
128 def sync_node3blocks(self
, block_hashes
):
129 """Use submitblock to sync node3's chain with the other nodes
131 If submitblock fails, restart the node and get the new utxo hash.
132 If any nodes crash while updating, we'll compare utxo hashes to
133 ensure recovery was successful."""
135 node3_utxo_hash
= self
.nodes
[3].gettxoutsetinfo()['hash_serialized_2']
137 # Retrieve all the blocks from node3
139 for block_hash
in block_hashes
:
140 blocks
.append([block_hash
, self
.nodes
[3].getblock(block_hash
, 0)])
142 # Deliver each block to each other node
144 nodei_utxo_hash
= None
145 self
.log
.debug("Syncing blocks to node %d", i
)
146 for (block_hash
, block
) in blocks
:
147 # Get the block from node3, and submit to node_i
148 self
.log
.debug("submitting block %s", block_hash
)
149 if not self
.submit_block_catch_error(i
, block
):
150 # TODO: more carefully check that the crash is due to -dbcrashratio
151 # (change the exit code perhaps, and check that here?)
152 self
.wait_for_node_exit(i
, timeout
=30)
153 self
.log
.debug("Restarting node %d after block hash %s", i
, block_hash
)
154 nodei_utxo_hash
= self
.restart_node(i
, block_hash
)
155 assert nodei_utxo_hash
is not None
156 self
.restart_counts
[i
] += 1
158 # Clear it out after successful submitblock calls -- the cached
159 # utxo hash will no longer be correct
160 nodei_utxo_hash
= None
162 # Check that the utxo hash matches node3's utxo set
163 # NOTE: we only check the utxo set if we had to restart the node
164 # after the last block submitted:
165 # - checking the utxo hash causes a cache flush, which we don't
166 # want to do every time; so
167 # - we only update the utxo cache after a node restart, since flushing
168 # the cache is a no-op at that point
169 if nodei_utxo_hash
is not None:
170 self
.log
.debug("Checking txoutsetinfo matches for node %d", i
)
171 assert_equal(nodei_utxo_hash
, node3_utxo_hash
)
173 def verify_utxo_hash(self
):
174 """Verify that the utxo hash of each node matches node3.
176 Restart any nodes that crash while querying."""
177 node3_utxo_hash
= self
.nodes
[3].gettxoutsetinfo()['hash_serialized_2']
178 self
.log
.info("Verifying utxo hash matches for all nodes")
182 nodei_utxo_hash
= self
.nodes
[i
].gettxoutsetinfo()['hash_serialized_2']
184 # probably a crash on db flushing
185 nodei_utxo_hash
= self
.restart_node(i
, self
.nodes
[3].getbestblockhash())
186 assert_equal(nodei_utxo_hash
, node3_utxo_hash
)
188 def generate_small_transactions(self
, node
, count
, utxo_list
):
189 FEE
= 1000 # TODO: replace this with node relay fee based calculation
191 random
.shuffle(utxo_list
)
192 while len(utxo_list
) >= 2 and num_transactions
< count
:
196 utxo
= utxo_list
.pop()
197 tx
.vin
.append(CTxIn(COutPoint(int(utxo
['txid'], 16), utxo
['vout'])))
198 input_amount
+= int(utxo
['amount'] * COIN
)
199 output_amount
= (input_amount
- FEE
) // 3
201 if output_amount
<= 0:
202 # Sanity check -- if we chose inputs that are too small, skip
206 tx
.vout
.append(CTxOut(output_amount
, hex_str_to_bytes(utxo
['scriptPubKey'])))
208 # Sign and send the transaction to get into the mempool
209 tx_signed_hex
= node
.signrawtransaction(ToHex(tx
))['hex']
210 node
.sendrawtransaction(tx_signed_hex
)
211 num_transactions
+= 1
214 # Track test coverage statistics
215 self
.restart_counts
= [0, 0, 0] # Track the restarts for nodes 0-2
216 self
.crashed_on_restart
= 0 # Track count of crashes during recovery
218 # Start by creating a lot of utxos on node3
219 initial_height
= self
.nodes
[3].getblockcount()
220 utxo_list
= create_confirmed_utxos(self
.nodes
[3].getnetworkinfo()['relayfee'], self
.nodes
[3], 5000)
221 self
.log
.info("Prepped %d utxo entries", len(utxo_list
))
223 # Sync these blocks with the other nodes
224 block_hashes_to_sync
= []
225 for height
in range(initial_height
+ 1, self
.nodes
[3].getblockcount() + 1):
226 block_hashes_to_sync
.append(self
.nodes
[3].getblockhash(height
))
228 self
.log
.debug("Syncing %d blocks with other nodes", len(block_hashes_to_sync
))
229 # Syncing the blocks could cause nodes to crash, so the test begins here.
230 self
.sync_node3blocks(block_hashes_to_sync
)
232 starting_tip_height
= self
.nodes
[3].getblockcount()
235 # each time through the loop, generate a bunch of transactions,
236 # and then either mine a single new block on the tip, or some-sized reorg.
238 self
.log
.info("Iteration %d, generating 2500 transactions %s", i
, self
.restart_counts
)
239 # Generate a bunch of small-ish transactions
240 self
.generate_small_transactions(self
.nodes
[3], 2500, utxo_list
)
241 # Pick a random block between current tip, and starting tip
242 current_height
= self
.nodes
[3].getblockcount()
243 random_height
= random
.randint(starting_tip_height
, current_height
)
244 self
.log
.debug("At height %d, considering height %d", current_height
, random_height
)
245 if random_height
> starting_tip_height
:
246 # Randomly reorg from this point with some probability (1/4 for
247 # tip, 1/5 for tip-1, ...)
248 if random
.random() < 1.0 / (current_height
+ 4 - random_height
):
249 self
.log
.debug("Invalidating block at height %d", random_height
)
250 self
.nodes
[3].invalidateblock(self
.nodes
[3].getblockhash(random_height
))
252 # Now generate new blocks until we pass the old tip height
253 self
.log
.debug("Mining longer tip")
255 while current_height
+ 1 > self
.nodes
[3].getblockcount():
256 block_hashes
.extend(self
.nodes
[3].generate(min(10, current_height
+ 1 - self
.nodes
[3].getblockcount())))
257 self
.log
.debug("Syncing %d new blocks...", len(block_hashes
))
258 self
.sync_node3blocks(block_hashes
)
259 utxo_list
= self
.nodes
[3].listunspent()
260 self
.log
.debug("Node3 utxo count: %d", len(utxo_list
))
262 # Check that the utxo hashes agree with node3
263 # Useful side effect: each utxo cache gets flushed here, so that we
264 # won't get crashes on shutdown at the end of the test.
265 self
.verify_utxo_hash()
267 # Check the test coverage
268 self
.log
.info("Restarted nodes: %s; crashes on restart: %d", self
.restart_counts
, self
.crashed_on_restart
)
270 # If no nodes were restarted, we didn't test anything.
271 assert self
.restart_counts
!= [0, 0, 0]
273 # Make sure we tested the case of crash-during-recovery.
274 assert self
.crashed_on_restart
> 0
276 # Warn if any of the nodes escaped restart.
278 if self
.restart_counts
[i
] == 0:
279 self
.log
.warn("Node %d never crashed during utxo flush!", i
)
281 if __name__
== "__main__":
282 ChainstateWriteCrashTest().main()