Avoid reference to undefined name: stderr does not exist, sys.stderr does
[bitcoinplatinum.git] / contrib / linearize / linearize-data.py
blobafcec2b60ae4f5d4ceae805bc076dbf39c982c62
1 #!/usr/bin/env python3
3 # linearize-data.py: Construct a linear, no-fork version of the chain.
5 # Copyright (c) 2013-2016 The Bitcoin Core developers
6 # Distributed under the MIT software license, see the accompanying
7 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
10 from __future__ import print_function, division
11 import struct
12 import re
13 import os
14 import os.path
15 import sys
16 import hashlib
17 import datetime
18 import time
19 from collections import namedtuple
20 from binascii import hexlify, unhexlify
# Module-wide configuration mapping; it is filled in by the script entry
# point at the bottom of this file from the CONFIG-FILE argument.
settings = dict()
24 ##### Switch endian-ness #####
def hex_switchEndian(s):
    """Return hex string `s` with its two-character groups in reverse order.

    The string is cut into consecutive pairs starting from the left (a
    trailing odd character forms a group of its own), and the groups are
    then emitted last-to-first.
    """
    encoded_pairs = []
    for start in range(0, len(s), 2):
        encoded_pairs.append(s[start:start + 2].encode())
    encoded_pairs.reverse()
    return b''.join(encoded_pairs).decode()
def uint32(x):
    """Keep only the low 32 bits of integer `x`."""
    low_32_bits = 0xffffffff
    return x & low_32_bits
def bytereverse(x):
    """Reverse the byte order of a 32-bit word.

    Each of the four bytes is shifted to its mirrored position and the
    combined value is truncated with uint32(). The `x >> 24` term is not
    masked on its own, so `x` is expected to already fit in 32 bits.
    """
    lowest_to_top = x << 24
    second_up = (x << 8) & 0x00ff0000
    third_down = (x >> 8) & 0x0000ff00
    highest_to_bottom = x >> 24
    return uint32(lowest_to_top | second_up | third_down | highest_to_bottom)
def bufreverse(in_buf):
    """Byte-swap every 4-byte word of `in_buf`, keeping the word order.

    Each word is unpacked as a native unsigned int ('@I'), passed through
    bytereverse() and packed again. struct raises struct.error if a slice
    is not exactly the size of one native unsigned int (for example when
    the buffer length is not a multiple of that size).
    """
    swapped_words = [
        struct.pack('@I', bytereverse(struct.unpack('@I', in_buf[pos:pos + 4])[0]))
        for pos in range(0, len(in_buf), 4)
    ]
    return b''.join(swapped_words)
def wordreverse(in_buf):
    """Reverse the order of the 4-byte words in `in_buf`.

    Bytes inside each word keep their order. The buffer is cut into words
    from the front, so a short trailing chunk (when the length is not a
    multiple of 4) ends up at the start of the result.
    """
    word_starts = range(0, len(in_buf), 4)
    return b''.join(in_buf[pos:pos + 4] for pos in reversed(word_starts))
def calc_hdr_hash(blk_hdr):
    """Return the raw double-SHA256 digest (32 bytes) of `blk_hdr`."""
    first_pass = hashlib.sha256(blk_hdr).digest()
    return hashlib.sha256(first_pass).digest()
def calc_hash_str(blk_hdr):
    """Return the hash of `blk_hdr` as a lowercase hex string.

    The double-SHA256 digest from calc_hdr_hash() is byte-swapped within
    each 4-byte word (bufreverse) and then has its words reversed
    (wordreverse) before being hex-encoded.
    """
    digest = calc_hdr_hash(blk_hdr)
    reordered = wordreverse(bufreverse(digest))
    return hexlify(reordered).decode('utf-8')
def get_blk_dt(blk_hdr):
    """Extract the timestamp stored at byte offset 68 of a block header.

    Returns a tuple `(month_start, nTime)`: `nTime` is the little-endian
    unsigned 32-bit value read from bytes 68..71, and `month_start` is a
    datetime for the first day of the month that `nTime` falls in, using
    the local-time conversion of datetime.fromtimestamp().
    """
    (nTime,) = struct.unpack_from("<I", blk_hdr, 68)
    stamp = datetime.datetime.fromtimestamp(nTime)
    month_start = datetime.datetime(stamp.year, stamp.month, 1)
    return (month_start, nTime)
76 # When getting the list of block hashes, undo any byte reversals.
def get_block_hashes(settings):
    """Read the ordered list of block hashes from the hashlist file.

    Args:
        settings: mapping that provides 'hashlist' (path of a text file
            with one hash per line) and 'rev_hash_bytes' (the string
            'true' to undo a byte reversal on every line via
            hex_switchEndian()).

    Returns:
        A list with one entry per line of the file, in file order, with
        trailing whitespace stripped. The position of an entry in the
        list is used by callers as the block height.
    """
    blkindex = []
    # Use a context manager so the hashlist handle is closed even if a
    # line fails to convert; the previous version never closed it.
    with open(settings['hashlist'], "r") as f:
        for line in f:
            line = line.rstrip()
            if settings['rev_hash_bytes'] == 'true':
                line = hex_switchEndian(line)
            blkindex.append(line)

    print("Read " + str(len(blkindex)) + " hashes")

    return blkindex
90 # The block map shouldn't give or receive byte-reversed hashes.
def mkblockmap(blkindex):
    """Build a hash -> height lookup from the ordered hash list.

    The height of a hash is its index in `blkindex`; if a hash occurs
    more than once, the last index wins.
    """
    return {blk_hash: height for height, blk_hash in enumerate(blkindex)}
97 # Block header and extent on disk
# Record describing where a block lives on disk:
#   fn     - number of the input blk file
#   offset - file position right after the 80-byte block header
#   inhdr  - the 8-byte record header (magic + length) as read
#   blkhdr - the 80-byte block header as read
#   size   - length of the block data that follows the header
BlockExtent = namedtuple('BlockExtent', 'fn offset inhdr blkhdr size')
class BlockDataCopier:
    """Copy blocks from numbered blk*.dat input files to height-ordered output.

    Input files are scanned sequentially. A block whose height (looked up
    in `blkmap` by header hash) equals the number of blocks already
    written is copied immediately; other known blocks are remembered (and
    cached in memory while the cache budget allows) until their turn
    comes. Blocks whose hash is not in `blkmap` are skipped.
    """

    def __init__(self, settings, blkindex, blkmap):
        """Set up copier state.

        Args:
            settings: mapping with the keys read by this class:
                'max_out_sz', 'file_timestamp', 'split_timestamp',
                'input', 'netmagic', 'out_of_order_cache_sz',
                'debug_output', and either 'output' (directory, output is
                split into blkNNNNN.dat files) or 'output_file' (single
                file).
            blkindex: ordered list of block hashes; its length is the
                number of blocks to write.
            blkmap: mapping from block hash string to height.
        """
        self.settings = settings
        self.blkindex = blkindex
        self.blkmap = blkmap

        self.inFn = 0
        self.inF = None
        self.outFn = 0
        self.outsz = 0
        self.outF = None
        self.outFname = None
        self.blkCountIn = 0
        self.blkCountOut = 0
        # Hash of the header most recently scanned by run(); only used in
        # the "New month" progress message.
        self.hash_str = ''

        self.lastDate = datetime.datetime(2000, 1, 1)
        # Starting value for the highest block timestamp seen so far.
        self.highTS = 1408893517 - 315360000
        self.timestampSplit = False
        self.fileOutput = True
        self.setFileTime = False
        self.maxOutSz = settings['max_out_sz']
        if 'output' in settings:
            # An output directory takes precedence over a single file.
            self.fileOutput = False
        if settings['file_timestamp'] != 0:
            self.setFileTime = True
        if settings['split_timestamp'] != 0:
            self.timestampSplit = True
        # Extents and cache for out-of-order blocks
        self.blockExtents = {}
        self.outOfOrderData = {}
        self.outOfOrderSize = 0  # running total size for items in outOfOrderData

    def _closeOutput(self):
        """Close the current output file, if one is open.

        When file timestamps are enabled, the file's modification time is
        set to the highest block timestamp seen so far.
        """
        if self.outF is None:
            return
        self.outF.close()
        if self.setFileTime:
            os.utime(self.outFname, (int(time.time()), self.highTS))
        self.outF = None
        self.outFname = None

    def _rotateOutput(self):
        """Finish the open output file and advance to the next file number.

        Does nothing when no output file is open, so the first block never
        skips file number 0 and never touches a missing file handle.
        """
        if self.outF is None:
            return
        self._closeOutput()
        self.outFn = self.outFn + 1
        self.outsz = 0

    def writeBlock(self, inhdr, blk_hdr, rawblock):
        """Append one block (record header, block header, data) to the output.

        Starts a new output file first when directory output is in use and
        the block would push the current file past `max_out_sz`, or when
        month splitting is enabled and the block belongs to a later month
        than any block written so far.
        """
        blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
        if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
            self._rotateOutput()

        (blkDate, blkTS) = get_blk_dt(blk_hdr)
        if self.timestampSplit and (blkDate > self.lastDate):
            print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str)
            self.lastDate = blkDate
            self._rotateOutput()

        if not self.outF:
            if self.fileOutput:
                self.outFname = self.settings['output_file']
            else:
                self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn)
            print("Output file " + self.outFname)
            self.outF = open(self.outFname, "wb")

        self.outF.write(inhdr)
        self.outF.write(blk_hdr)
        self.outF.write(rawblock)
        self.outsz = self.outsz + blockSizeOnDisk

        self.blkCountOut = self.blkCountOut + 1
        if blkTS > self.highTS:
            self.highTS = blkTS

        if (self.blkCountOut % 1000) == 0:
            print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
                  (self.blkCountIn, self.blkCountOut, len(self.blkindex),
                   100.0 * self.blkCountOut / len(self.blkindex)))

    def inFileName(self, fn):
        """Return the path of input file number `fn` inside the input directory."""
        return os.path.join(self.settings['input'], "blk%05d.dat" % fn)

    def fetchBlock(self, extent):
        '''Fetch block contents from disk given extents'''
        with open(self.inFileName(extent.fn), "rb") as f:
            f.seek(extent.offset)
            return f.read(extent.size)

    def copyOneBlock(self):
        '''Find the next block to be written in the input, and copy it to the output.'''
        extent = self.blockExtents.pop(self.blkCountOut)
        if self.blkCountOut in self.outOfOrderData:
            # If the data is cached, use it from memory and remove from the cache
            rawblock = self.outOfOrderData.pop(self.blkCountOut)
            self.outOfOrderSize -= len(rawblock)
        else:  # Otherwise look up data on disk
            rawblock = self.fetchBlock(extent)

        self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)

    def run(self):
        """Scan the input files and write blocks until all heights are written.

        Stops early (after printing a message) when the next input file
        cannot be opened or a record does not start with the configured
        network magic. The input and output files are always closed on
        the way out, and the last output file receives its timestamp when
        file timestamps are enabled.
        """
        try:
            while self.blkCountOut < len(self.blkindex):
                if not self.inF:
                    fname = self.inFileName(self.inFn)
                    print("Input file " + fname)
                    try:
                        self.inF = open(fname, "rb")
                    except IOError:
                        print("Premature end of block data")
                        return

                inhdr = self.inF.read(8)
                # End of file or zero padding: move on to the next input
                # file. Compare a one-byte slice because indexing bytes
                # yields an int on Python 3, which never equals "\0".
                if (not inhdr or (inhdr[:1] == b"\0")):
                    self.inF.close()
                    self.inF = None
                    self.inFn = self.inFn + 1
                    continue

                inMagic = inhdr[:4]
                if (inMagic != self.settings['netmagic']):
                    print("Invalid magic: " + hexlify(inMagic).decode('utf-8'))
                    return
                inLenLE = inhdr[4:]
                su = struct.unpack("<I", inLenLE)
                inLen = su[0] - 80  # length without header
                blk_hdr = self.inF.read(80)
                inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)

                self.hash_str = calc_hash_str(blk_hdr)
                # Use the instance's own map and settings rather than the
                # module globals, so the class also works when imported.
                if self.hash_str not in self.blkmap:
                    # Because blocks can be written to files out-of-order as of 0.10, the script
                    # may encounter blocks it doesn't know about. Treat as debug output.
                    if self.settings['debug_output'] == 'true':
                        print("Skipping unknown block " + self.hash_str)
                    self.inF.seek(inLen, os.SEEK_CUR)
                    continue

                blkHeight = self.blkmap[self.hash_str]
                self.blkCountIn += 1

                if self.blkCountOut == blkHeight:
                    # If in-order block, just copy
                    rawblock = self.inF.read(inLen)
                    self.writeBlock(inhdr, blk_hdr, rawblock)

                    # See if we can catch up to prior out-of-order blocks
                    while self.blkCountOut in self.blockExtents:
                        self.copyOneBlock()

                else:  # If out-of-order, skip over block data for now
                    self.blockExtents[blkHeight] = inExtent
                    if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
                        # If there is space in the cache, read the data
                        # Reading the data in file sequence instead of seeking and fetching it later is preferred,
                        # but we don't want to fill up memory
                        self.outOfOrderData[blkHeight] = self.inF.read(inLen)
                        self.outOfOrderSize += inLen
                    else:  # If no space in cache, seek forward
                        self.inF.seek(inLen, os.SEEK_CUR)

            print("Done (%i blocks written)" % (self.blkCountOut))
        finally:
            # Release file handles on every exit path and flush the last
            # output file to disk.
            if self.inF:
                self.inF.close()
                self.inF = None
            self._closeOutput()
# Script entry point: read the key=value configuration file named on the
# command line, fill in defaults, normalise value types, then run the copier.
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: linearize-data.py CONFIG-FILE")
        sys.exit(1)

    # The context manager closes the config file even if parsing fails.
    with open(sys.argv[1]) as f:
        for line in f:
            # skip comment lines
            # (raw strings keep the regex escapes from being treated as
            # invalid string escapes by newer Python versions)
            if re.search(r'^\s*#', line):
                continue

            # parse key=value lines
            m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
            if m is None:
                continue
            settings[m.group(1)] = m.group(2)

    # Force hash byte format setting to be lowercase to make comparisons easier.
    # Also place upfront in case any settings need to know about it.
    settings.setdefault('rev_hash_bytes', 'false')
    settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower()

    # Values used for any key the config file does not set.
    defaults = {
        'netmagic': 'f9beb4d9',
        'genesis': '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f',
        'input': 'input',
        'hashlist': 'hashlist.txt',
        'file_timestamp': 0,
        'split_timestamp': 0,
        'max_out_sz': 1000 * 1000 * 1000,
        'out_of_order_cache_sz': 100 * 1000 * 1000,
        'debug_output': 'false',
    }
    for key, value in defaults.items():
        settings.setdefault(key, value)

    # Values parsed from the file are strings; convert them to the types
    # the rest of the script works with.
    settings['max_out_sz'] = int(settings['max_out_sz'])
    settings['split_timestamp'] = int(settings['split_timestamp'])
    settings['file_timestamp'] = int(settings['file_timestamp'])
    settings['netmagic'] = unhexlify(settings['netmagic'].encode('utf-8'))
    settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
    settings['debug_output'] = settings['debug_output'].lower()

    if 'output_file' not in settings and 'output' not in settings:
        print("Missing output file / directory")
        sys.exit(1)

    blkindex = get_block_hashes(settings)
    blkmap = mkblockmap(blkindex)

    # Block hash map won't be byte-reversed. Neither should the genesis hash.
    if settings['genesis'] not in blkmap:
        print("Genesis block not found in hashlist")
    else:
        BlockDataCopier(settings, blkindex, blkmap).run()