3 # linearize-data.py: Construct a linear, no-fork version of the chain.
5 # Copyright (c) 2013-2017 The Bitcoin Core developers
6 # Distributed under the MIT software license, see the accompanying
7 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
from __future__ import print_function, division
import datetime
import hashlib
import os
import os.path
import re
import struct
import sys
import time
from binascii import hexlify, unhexlify
from collections import namedtuple
##### Switch endian-ness #####
def hex_switchEndian(s):
    """ Switches the endianness of a hex string (in pairs of hex chars) """
    # Collect the string two hex characters (one byte) at a time, then
    # emit the byte-pairs in reverse order.
    pairs = []
    for idx in range(0, len(s), 2):
        pairs.append(s[idx:idx + 2].encode())
    pairs.reverse()
    return b''.join(pairs).decode()
def bytereverse(x):
    """Reverse the byte order of the 32-bit unsigned integer x.

    E.g. 0x12345678 -> 0x78563412.
    """
    # The final mask keeps the result inside 32 bits (this replaces the
    # original uint32() helper): the left shifts would otherwise carry
    # bits past the 32-bit boundary in Python's arbitrary-precision ints.
    return (((x) << 24) | (((x) << 8) & 0x00ff0000) |
            (((x) >> 8) & 0x0000ff00) | ((x) >> 24)) & 0xffffffff
def bufreverse(in_buf):
    """Reverse the bytes inside each 32-bit word of in_buf.

    in_buf is a bytes object whose length is a multiple of 4; the words
    themselves stay in place, only the bytes within each word are flipped.
    """
    out_words = []
    for i in range(0, len(in_buf), 4):
        word = struct.unpack('@I', in_buf[i:i+4])[0]
        out_words.append(struct.pack('@I', bytereverse(word)))
    return b''.join(out_words)
def wordreverse(in_buf):
    """Reverse the order of the 32-bit words in in_buf.

    Bytes within each word keep their order; only the sequence of
    4-byte words is reversed.
    """
    out_words = []
    for i in range(0, len(in_buf), 4):
        out_words.append(in_buf[i:i+4])
    # Reversing the word list is the whole point of this function.
    out_words.reverse()
    return b''.join(out_words)
def calc_hdr_hash(blk_hdr):
    """Return the double-SHA256 digest of the raw block header bytes."""
    hash1 = hashlib.sha256()
    hash1.update(blk_hdr)
    hash1_o = hash1.digest()
    # Second SHA-256 pass over the first digest (Bitcoin's "hash256").
    hash2 = hashlib.sha256()
    hash2.update(hash1_o)
    hash2_o = hash2.digest()
    return hash2_o
def calc_hash_str(blk_hdr):
    """Return the block hash of blk_hdr as a hex string.

    The double-SHA256 digest is byte- and word-reversed so the hex string
    matches the conventional (big-endian) block hash display format.
    """
    h = calc_hdr_hash(blk_hdr)
    h = bufreverse(h)
    h = wordreverse(h)
    hash_str = hexlify(h).decode('utf-8')
    return hash_str
def get_blk_dt(blk_hdr):
    """Extract the timestamp from a raw 80-byte block header.

    Returns (dt_ym, nTime): a datetime pinned to the first day of the
    block's month (used for month-based file splitting) and the raw
    32-bit little-endian nTime field found at offset 68 of the header.
    """
    members = struct.unpack("<I", blk_hdr[68:68+4])
    nTime = members[0]
    # NOTE: fromtimestamp() uses local time, so month boundaries depend
    # on the machine's timezone.
    dt = datetime.datetime.fromtimestamp(nTime)
    dt_ym = datetime.datetime(dt.year, dt.month, 1)
    return (dt_ym, nTime)
# When getting the list of block hashes, undo any byte reversals.
def get_block_hashes(settings):
    """Read the hash-list file named by settings['hashlist'].

    Returns the hashes as a list of hex strings, in file order (which is
    height order). Each line is un-byte-reversed first when
    settings['rev_hash_bytes'] == 'true'.
    """
    blkindex = []
    # Use a context manager so the file handle is always closed.
    with open(settings['hashlist'], "r") as f:
        for line in f:
            line = line.rstrip()
            if settings['rev_hash_bytes'] == 'true':
                line = hex_switchEndian(line)
            blkindex.append(line)

    print("Read " + str(len(blkindex)) + " hashes")
    return blkindex
# The block map shouldn't give or receive byte-reversed hashes.
def mkblockmap(blkindex):
    """Map each block hash in blkindex to its height (its list index)."""
    blkmap = {}
    for height, blkhash in enumerate(blkindex):
        blkmap[blkhash] = height
    return blkmap
# Block header and extent on disk: which input file the block lives in,
# where its payload starts, its 8-byte magic/length header, its 80-byte
# block header, and the payload size.
BlockExtent = namedtuple('BlockExtent', 'fn offset inhdr blkhdr size')
class BlockDataCopier:
    """Copies blocks out of blkNNNNN.dat input files into linearized output,
    writing them strictly in the height order given by the block map."""

    def __init__(self, settings, blkindex, blkmap):
        self.settings = settings
        self.blkindex = blkindex
        self.blkmap = blkmap  # BUGFIX: run() reads self.blkmap; it must be stored

        # Input / output file state.
        self.inFn = 0          # index of current blkNNNNN.dat input file
        self.inF = None        # open input file object, or None
        self.outFn = 0         # index of current output file (directory mode)
        self.outsz = 0         # bytes written so far to current output file
        self.outF = None
        self.outFname = None
        self.blkCountIn = 0    # known blocks seen in the input
        self.blkCountOut = 0   # blocks written to the output

        self.lastDate = datetime.datetime(2000, 1, 1)
        self.highTS = 1408893517 - 315360000
        self.timestampSplit = False
        self.fileOutput = True
        self.setFileTime = False
        self.maxOutSz = settings['max_out_sz']
        if 'output' in settings:
            self.fileOutput = False
        if settings['file_timestamp'] != 0:
            self.setFileTime = True
        if settings['split_timestamp'] != 0:
            self.timestampSplit = True
        # Extents and cache for out-of-order blocks
        self.blockExtents = {}
        self.outOfOrderData = {}
        self.outOfOrderSize = 0  # running total size for items in outOfOrderData

    def writeBlock(self, inhdr, blk_hdr, rawblock):
        '''Append one block (magic/length header + block header + payload)
        to the output, rolling over to a new file when size or month
        limits are hit.'''
        blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
        if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
            # Current output file is full: close it and move to the next.
            self.outF.close()
            if self.setFileTime:
                os.utime(self.outFname, (int(time.time()), self.highTS))
            self.outF = None
            self.outFname = None
            self.outFn = self.outFn + 1
            self.outsz = 0

        (blkDate, blkTS) = get_blk_dt(blk_hdr)
        if self.timestampSplit and (blkDate > self.lastDate):
            # Optionally start a new output file at each month boundary.
            print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str)
            self.lastDate = blkDate
            if self.outF:
                self.outF.close()
                if self.setFileTime:
                    os.utime(self.outFname, (int(time.time()), self.highTS))
                self.outF = None
                self.outFname = None
                self.outFn = self.outFn + 1
                self.outsz = 0

        if not self.outF:
            if self.fileOutput:
                self.outFname = self.settings['output_file']
            else:
                self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn)
            print("Output file " + self.outFname)
            self.outF = open(self.outFname, "wb")

        self.outF.write(inhdr)
        self.outF.write(blk_hdr)
        self.outF.write(rawblock)
        self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)

        self.blkCountOut = self.blkCountOut + 1
        if blkTS > self.highTS:
            self.highTS = blkTS

        if (self.blkCountOut % 1000) == 0:
            print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
                  (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))

    def inFileName(self, fn):
        '''Return the path of input file number fn inside the input directory.'''
        return os.path.join(self.settings['input'], "blk%05d.dat" % fn)

    def fetchBlock(self, extent):
        '''Fetch block contents from disk given extents'''
        with open(self.inFileName(extent.fn), "rb") as f:
            f.seek(extent.offset)
            return f.read(extent.size)

    def copyOneBlock(self):
        '''Find the next block to be written in the input, and copy it to the output.'''
        extent = self.blockExtents.pop(self.blkCountOut)
        if self.blkCountOut in self.outOfOrderData:
            # If the data is cached, use it from memory and remove from the cache
            rawblock = self.outOfOrderData.pop(self.blkCountOut)
            self.outOfOrderSize -= len(rawblock)
        else:  # Otherwise look up data on disk
            rawblock = self.fetchBlock(extent)

        self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)

    def run(self):
        '''Scan all input files, writing every known block in height order.'''
        while self.blkCountOut < len(self.blkindex):
            if not self.inF:
                fname = self.inFileName(self.inFn)
                print("Input file " + fname)
                try:
                    self.inF = open(fname, "rb")
                except IOError:
                    # Ran out of input files before writing every block.
                    print("Premature end of block data")
                    return

            inhdr = self.inF.read(8)
            # BUGFIX: compare a one-byte slice with b"\0"; inhdr[0] is an
            # int in Python 3 and could never equal the str "\0".
            if (not inhdr or (inhdr[0:1] == b"\0")):
                self.inF.close()
                self.inF = None
                self.inFn = self.inFn + 1
                continue

            inMagic = inhdr[:4]
            if (inMagic != self.settings['netmagic']):
                print("Invalid magic: " + hexlify(inMagic).decode('utf-8'))
                return
            inLenLE = inhdr[4:]
            su = struct.unpack("<I", inLenLE)
            inLen = su[0] - 80  # length without header
            blk_hdr = self.inF.read(80)
            inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)

            self.hash_str = calc_hash_str(blk_hdr)
            # BUGFIX: consult the instance's map/settings, not the globals
            # 'blkmap'/'settings' that only exist when run as a script.
            if not self.hash_str in self.blkmap:
                # Because blocks can be written to files out-of-order as of 0.10, the script
                # may encounter blocks it doesn't know about. Treat as debug output.
                if self.settings['debug_output'] == 'true':
                    print("Skipping unknown block " + self.hash_str)
                self.inF.seek(inLen, os.SEEK_CUR)
                continue

            blkHeight = self.blkmap[self.hash_str]
            self.blkCountIn += 1

            if self.blkCountOut == blkHeight:
                # If in-order block, just copy
                rawblock = self.inF.read(inLen)
                self.writeBlock(inhdr, blk_hdr, rawblock)

                # See if we can catch up to prior out-of-order blocks
                while self.blkCountOut in self.blockExtents:
                    self.copyOneBlock()

            else:  # If out-of-order, skip over block data for now
                self.blockExtents[blkHeight] = inExtent
                if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
                    # If there is space in the cache, read the data
                    # Reading the data in file sequence instead of seeking and fetching it later is preferred,
                    # but we don't want to fill up memory
                    self.outOfOrderData[blkHeight] = self.inF.read(inLen)
                    self.outOfOrderSize += inLen
                else:  # If no space in cache, seek forward
                    self.inF.seek(inLen, os.SEEK_CUR)

        print("Done (%i blocks written)" % (self.blkCountOut))
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: linearize-data.py CONFIG-FILE")
        sys.exit(1)

    # Parse the key=value config file, skipping comment lines.
    settings = {}
    with open(sys.argv[1]) as f:
        for line in f:
            # skip comment lines
            m = re.search(r'^\s*#', line)
            if m:
                continue

            # parse key=value lines
            m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
            if m is None:
                continue
            settings[m.group(1)] = m.group(2)

    # Force hash byte format setting to be lowercase to make comparisons easier.
    # Also place upfront in case any settings need to know about it.
    if 'rev_hash_bytes' not in settings:
        settings['rev_hash_bytes'] = 'false'
    settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower()

    # Fill in defaults for anything the config file did not set.
    if 'netmagic' not in settings:
        settings['netmagic'] = 'f9beb4d9'
    if 'genesis' not in settings:
        settings['genesis'] = '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f'
    if 'input' not in settings:
        settings['input'] = 'input'
    if 'hashlist' not in settings:
        settings['hashlist'] = 'hashlist.txt'
    if 'file_timestamp' not in settings:
        settings['file_timestamp'] = 0
    if 'split_timestamp' not in settings:
        settings['split_timestamp'] = 0
    if 'max_out_sz' not in settings:
        settings['max_out_sz'] = 1000 * 1000 * 1000
    if 'out_of_order_cache_sz' not in settings:
        settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
    if 'debug_output' not in settings:
        settings['debug_output'] = 'false'

    # Normalize types: config-file values arrive as strings.
    settings['max_out_sz'] = int(settings['max_out_sz'])
    settings['split_timestamp'] = int(settings['split_timestamp'])
    settings['file_timestamp'] = int(settings['file_timestamp'])
    settings['netmagic'] = unhexlify(settings['netmagic'].encode('utf-8'))
    settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
    settings['debug_output'] = settings['debug_output'].lower()

    if 'output_file' not in settings and 'output' not in settings:
        print("Missing output file / directory")
        sys.exit(1)

    blkindex = get_block_hashes(settings)
    blkmap = mkblockmap(blkindex)

    # Block hash map won't be byte-reversed. Neither should the genesis hash.
    if not settings['genesis'] in blkmap:
        print("Genesis block not found in hashlist")
        sys.exit(1)

    BlockDataCopier(settings, blkindex, blkmap).run()