3 # linearize-data.py: Construct a linear, no-fork version of the chain.
5 # Copyright (c) 2013-2017 The Bitcoin Core developers
6 # Distributed under the MIT software license, see the accompanying
7 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
from __future__ import print_function, division
import datetime
import hashlib
import os
import os.path
import re
import struct
import sys
import time
from binascii import hexlify, unhexlify
from collections import namedtuple
##### Switch endian-ness #####
def hex_switchEndian(s):
    """ Switches the endianness of a hex string (in pairs of hex chars) """
    # Collect the string two hex characters (one byte) at a time, then
    # emit the byte-pairs in reverse order.
    pairs = []
    for idx in range(0, len(s), 2):
        pairs.append(s[idx:idx + 2].encode())
    pairs.reverse()
    return b''.join(pairs).decode()
def bytereverse(x):
    """Reverse the byte order of the 32-bit unsigned integer x.

    E.g. 0x12345678 -> 0x78563412.
    """
    # The final mask keeps the result inside 32 bits (this replaces the
    # original uint32() helper): the left shifts would otherwise carry
    # bits past the 32-bit boundary in Python's arbitrary-precision ints.
    return (((x) << 24) | (((x) << 8) & 0x00ff0000) |
            (((x) >> 8) & 0x0000ff00) | ((x) >> 24)) & 0xffffffff
def bufreverse(in_buf):
    """Reverse the bytes inside each 32-bit word of in_buf.

    in_buf is a bytes object whose length is a multiple of 4; the words
    themselves stay in place, only the bytes within each word are flipped.
    """
    out_words = []
    for i in range(0, len(in_buf), 4):
        word = struct.unpack('@I', in_buf[i:i+4])[0]
        out_words.append(struct.pack('@I', bytereverse(word)))
    return b''.join(out_words)
def wordreverse(in_buf):
    """Reverse the order of the 32-bit words in in_buf.

    Bytes within each word keep their order; only the sequence of
    4-byte words is reversed.
    """
    out_words = []
    for i in range(0, len(in_buf), 4):
        out_words.append(in_buf[i:i+4])
    # Reversing the word list is the whole point of this function.
    out_words.reverse()
    return b''.join(out_words)
def calc_hdr_hash(blk_hdr):
    """Return the double-SHA256 digest of the raw block header bytes."""
    hash1 = hashlib.sha256()
    hash1.update(blk_hdr)
    hash1_o = hash1.digest()
    # Second SHA-256 pass over the first digest (Bitcoin's "hash256").
    hash2 = hashlib.sha256()
    hash2.update(hash1_o)
    hash2_o = hash2.digest()
    return hash2_o
def calc_hash_str(blk_hdr):
    """Return the block hash of blk_hdr as a hex string.

    The double-SHA256 digest is byte- and word-reversed so the hex string
    matches the conventional (big-endian) block hash display format.
    """
    h = calc_hdr_hash(blk_hdr)
    h = bufreverse(h)
    h = wordreverse(h)
    hash_str = hexlify(h).decode('utf-8')
    return hash_str
def get_blk_dt(blk_hdr):
    """Extract the timestamp from a raw 80-byte block header.

    Returns (dt_ym, nTime): a datetime pinned to the first day of the
    block's month (used for month-based file splitting) and the raw
    32-bit little-endian nTime field found at offset 68 of the header.
    """
    members = struct.unpack("<I", blk_hdr[68:68+4])
    nTime = members[0]
    # NOTE: fromtimestamp() uses local time, so month boundaries depend
    # on the machine's timezone.
    dt = datetime.datetime.fromtimestamp(nTime)
    dt_ym = datetime.datetime(dt.year, dt.month, 1)
    return (dt_ym, nTime)
# When getting the list of block hashes, undo any byte reversals.
def get_block_hashes(settings):
    """Read the hash-list file named by settings['hashlist'].

    Returns the hashes as a list of hex strings, in file order (which is
    height order). Each line is un-byte-reversed first when
    settings['rev_hash_bytes'] == 'true'.
    """
    blkindex = []
    # Use a context manager so the file handle is always closed.
    with open(settings['hashlist'], "r") as f:
        for line in f:
            line = line.rstrip()
            if settings['rev_hash_bytes'] == 'true':
                line = hex_switchEndian(line)
            blkindex.append(line)

    print("Read " + str(len(blkindex)) + " hashes")
    return blkindex
# The block map shouldn't give or receive byte-reversed hashes.
def mkblockmap(blkindex):
    """Map each block hash in blkindex to its height (its list index)."""
    blkmap = {}
    for height, blkhash in enumerate(blkindex):
        blkmap[blkhash] = height
    return blkmap
# Block header and extent on disk: which input file the block lives in,
# where its payload starts, its 8-byte magic/length header, its 80-byte
# block header, and the payload size.
BlockExtent = namedtuple('BlockExtent', 'fn offset inhdr blkhdr size')
class BlockDataCopier:
    """Copies blocks out of blkNNNNN.dat input files into linearized output,
    writing them strictly in the height order given by the block map."""

    def __init__(self, settings, blkindex, blkmap):
        self.settings = settings
        self.blkindex = blkindex
        self.blkmap = blkmap  # BUGFIX: run() reads self.blkmap; it must be stored

        # Input / output file state.
        self.inFn = 0          # index of current blkNNNNN.dat input file
        self.inF = None        # open input file object, or None
        self.outFn = 0         # index of current output file (directory mode)
        self.outsz = 0         # bytes written so far to current output file
        self.outF = None
        self.outFname = None
        self.blkCountIn = 0    # known blocks seen in the input
        self.blkCountOut = 0   # blocks written to the output

        self.lastDate = datetime.datetime(2000, 1, 1)
        self.highTS = 1408893517 - 315360000
        self.timestampSplit = False
        self.fileOutput = True
        self.setFileTime = False
        self.maxOutSz = settings['max_out_sz']
        if 'output' in settings:
            self.fileOutput = False
        if settings['file_timestamp'] != 0:
            self.setFileTime = True
        if settings['split_timestamp'] != 0:
            self.timestampSplit = True
        # Extents and cache for out-of-order blocks
        self.blockExtents = {}
        self.outOfOrderData = {}
        self.outOfOrderSize = 0  # running total size for items in outOfOrderData

    def writeBlock(self, inhdr, blk_hdr, rawblock):
        '''Append one block (magic/length header + block header + payload)
        to the output, rolling over to a new file when size or month
        limits are hit.'''
        blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
        if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
            # Current output file is full: close it and move to the next.
            self.outF.close()
            if self.setFileTime:
                os.utime(self.outFname, (int(time.time()), self.highTS))
            self.outF = None
            self.outFname = None
            self.outFn = self.outFn + 1
            self.outsz = 0

        (blkDate, blkTS) = get_blk_dt(blk_hdr)
        if self.timestampSplit and (blkDate > self.lastDate):
            # Optionally start a new output file at each month boundary.
            print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str)
            self.lastDate = blkDate
            if self.outF:
                self.outF.close()
                if self.setFileTime:
                    os.utime(self.outFname, (int(time.time()), self.highTS))
                self.outF = None
                self.outFname = None
                self.outFn = self.outFn + 1
                self.outsz = 0

        if not self.outF:
            if self.fileOutput:
                self.outFname = self.settings['output_file']
            else:
                self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn)
            print("Output file " + self.outFname)
            self.outF = open(self.outFname, "wb")

        self.outF.write(inhdr)
        self.outF.write(blk_hdr)
        self.outF.write(rawblock)
        self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)

        self.blkCountOut = self.blkCountOut + 1
        if blkTS > self.highTS:
            self.highTS = blkTS

        if (self.blkCountOut % 1000) == 0:
            print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
                  (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))

    def inFileName(self, fn):
        '''Return the path of input file number fn inside the input directory.'''
        return os.path.join(self.settings['input'], "blk%05d.dat" % fn)

    def fetchBlock(self, extent):
        '''Fetch block contents from disk given extents'''
        with open(self.inFileName(extent.fn), "rb") as f:
            f.seek(extent.offset)
            return f.read(extent.size)

    def copyOneBlock(self):
        '''Find the next block to be written in the input, and copy it to the output.'''
        extent = self.blockExtents.pop(self.blkCountOut)
        if self.blkCountOut in self.outOfOrderData:
            # If the data is cached, use it from memory and remove from the cache
            rawblock = self.outOfOrderData.pop(self.blkCountOut)
            self.outOfOrderSize -= len(rawblock)
        else:  # Otherwise look up data on disk
            rawblock = self.fetchBlock(extent)

        self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)

    def run(self):
        '''Scan all input files, writing every known block in height order.'''
        while self.blkCountOut < len(self.blkindex):
            if not self.inF:
                fname = self.inFileName(self.inFn)
                print("Input file " + fname)
                try:
                    self.inF = open(fname, "rb")
                except IOError:
                    # Ran out of input files before writing every block.
                    print("Premature end of block data")
                    return

            inhdr = self.inF.read(8)
            # BUGFIX: compare a one-byte slice with b"\0"; inhdr[0] is an
            # int in Python 3 and could never equal the str "\0".
            if (not inhdr or (inhdr[0:1] == b"\0")):
                self.inF.close()
                self.inF = None
                self.inFn = self.inFn + 1
                continue

            inMagic = inhdr[:4]
            if (inMagic != self.settings['netmagic']):
                print("Invalid magic: " + hexlify(inMagic).decode('utf-8'))
                return
            inLenLE = inhdr[4:]
            su = struct.unpack("<I", inLenLE)
            inLen = su[0] - 80  # length without header
            blk_hdr = self.inF.read(80)
            inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)

            self.hash_str = calc_hash_str(blk_hdr)
            # BUGFIX: consult the instance's map/settings, not the globals
            # 'blkmap'/'settings' that only exist when run as a script.
            if not self.hash_str in self.blkmap:
                # Because blocks can be written to files out-of-order as of 0.10, the script
                # may encounter blocks it doesn't know about. Treat as debug output.
                if self.settings['debug_output'] == 'true':
                    print("Skipping unknown block " + self.hash_str)
                self.inF.seek(inLen, os.SEEK_CUR)
                continue

            blkHeight = self.blkmap[self.hash_str]
            self.blkCountIn += 1

            if self.blkCountOut == blkHeight:
                # If in-order block, just copy
                rawblock = self.inF.read(inLen)
                self.writeBlock(inhdr, blk_hdr, rawblock)

                # See if we can catch up to prior out-of-order blocks
                while self.blkCountOut in self.blockExtents:
                    self.copyOneBlock()

            else:  # If out-of-order, skip over block data for now
                self.blockExtents[blkHeight] = inExtent
                if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
                    # If there is space in the cache, read the data
                    # Reading the data in file sequence instead of seeking and fetching it later is preferred,
                    # but we don't want to fill up memory
                    self.outOfOrderData[blkHeight] = self.inF.read(inLen)
                    self.outOfOrderSize += inLen
                else:  # If no space in cache, seek forward
                    self.inF.seek(inLen, os.SEEK_CUR)

        print("Done (%i blocks written)" % (self.blkCountOut))
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: linearize-data.py CONFIG-FILE")
        sys.exit(1)

    # Parse the key=value config file, skipping comment lines.
    settings = {}
    with open(sys.argv[1]) as f:
        for line in f:
            # skip comment lines
            m = re.search(r'^\s*#', line)
            if m:
                continue

            # parse key=value lines
            m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
            if m is None:
                continue
            settings[m.group(1)] = m.group(2)

    # Force hash byte format setting to be lowercase to make comparisons easier.
    # Also place upfront in case any settings need to know about it.
    if 'rev_hash_bytes' not in settings:
        settings['rev_hash_bytes'] = 'false'
    settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower()

    # Fill in defaults for anything the config file did not set.
    if 'netmagic' not in settings:
        settings['netmagic'] = 'f9beb4d9'
    if 'genesis' not in settings:
        settings['genesis'] = '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f'
    if 'input' not in settings:
        settings['input'] = 'input'
    if 'hashlist' not in settings:
        settings['hashlist'] = 'hashlist.txt'
    if 'file_timestamp' not in settings:
        settings['file_timestamp'] = 0
    if 'split_timestamp' not in settings:
        settings['split_timestamp'] = 0
    if 'max_out_sz' not in settings:
        settings['max_out_sz'] = 1000 * 1000 * 1000
    if 'out_of_order_cache_sz' not in settings:
        settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
    if 'debug_output' not in settings:
        settings['debug_output'] = 'false'

    # Normalize types: config-file values arrive as strings.
    settings['max_out_sz'] = int(settings['max_out_sz'])
    settings['split_timestamp'] = int(settings['split_timestamp'])
    settings['file_timestamp'] = int(settings['file_timestamp'])
    settings['netmagic'] = unhexlify(settings['netmagic'].encode('utf-8'))
    settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
    settings['debug_output'] = settings['debug_output'].lower()

    if 'output_file' not in settings and 'output' not in settings:
        print("Missing output file / directory")
        sys.exit(1)

    blkindex = get_block_hashes(settings)
    blkmap = mkblockmap(blkindex)

    # Block hash map won't be byte-reversed. Neither should the genesis hash.
    if not settings['genesis'] in blkmap:
        print("Genesis block not found in hashlist")
        sys.exit(1)

    BlockDataCopier(settings, blkindex, blkmap).run()