Merge branch 'tor-github/pr/415' into maint-0.3.5
[tor.git] / src / config / mmdb-convert.py
blob3a454a3fc1bb87b6c4850825f421ac0b0f2b32a3
1 #!/usr/bin/python3
3 # This software has been dedicated to the public domain under the CC0
4 # public domain dedication.
6 # To the extent possible under law, the person who associated CC0
7 # with mmdb-convert.py has waived all copyright and related or
8 # neighboring rights to mmdb-convert.py.
10 # You should have received a copy of the CC0 legalcode along with this
11 # work in doc/cc0.txt. If not, see
12 # <http://creativecommons.org/publicdomain/zero/1.0/>.
14 # Nick Mathewson is responsible for this kludge, but takes no
15 # responsibility for it.
17 """This kludge is meant to
18 parse mmdb files in sufficient detail to dump out the old format
19 that Tor expects. It's also meant to be pure-python.
21 When given a simplicity/speed tradeoff, it opts for simplicity.
23 You will not understand the code without understanding the MaxMind-DB
24 file format. It is specified at:
25 https://github.com/maxmind/MaxMind-DB/blob/master/MaxMind-DB-spec.md.
27 This isn't so much tested. When it breaks, you get to keep both
28 pieces.
29 """
31 import struct
32 import bisect
33 import socket
34 import binascii
35 import sys
36 import time
# Every MMDB file ends with a metadata section introduced by this marker;
# parse_mm_file() searches for its *last* occurrence with rindex().
METADATA_MARKER = b'\xab\xcd\xefMaxMind.com'

# Here's some python2/python3 junk. Better solutions wanted.
# On Python 3, indexing a bytestring yields an int, so ord() raises
# TypeError and byte_to_int becomes the identity function; on Python 2,
# indexing yields a one-character str and ord() is the right converter.
try:
    ord(b"1"[0])
except TypeError:
    def byte_to_int(b):
        "convert a single element of a bytestring to an integer."
        return b
else:
    byte_to_int = ord

# Here's some more python2/python3 junk. Better solutions wanted.
# On Python 2 the two-argument str() call raises TypeError and bytes are
# already str, so bytesToStr is just str; on Python 3 we decode as UTF-8.
try:
    str(b"a", "utf8")
except TypeError:
    bytesToStr = str
else:
    def bytesToStr(b):
        "convert a bytestring in utf8 to a string."
        return str(b, 'utf8')
def to_int(s):
    "Parse a big-endian integer from bytestring s."
    value = 0
    for octet in s:
        value = (value << 8) | byte_to_int(octet)
    return value
def to_int24(s):
    "Parse a pair of big-endian 24-bit integers from bytestring s."
    # Six bytes hold two 24-bit values; unpack as three 16-bit words and
    # redistribute the middle word's halves.
    hi, mid, lo = struct.unpack("!HHH", s)
    left = (hi << 8) | (mid >> 8)
    right = ((mid & 0xff) << 16) | lo
    return left, right
def to_int32(s):
    "Parse a pair of big-endian 32-bit integers from bytestring s."
    # struct.unpack already yields the (left, right) tuple we want.
    return struct.unpack("!LL", s)
def to_int28(s):
    """Parse a pair of big-endian 28-bit integers from bytestring s.

    Per the MaxMind-DB spec, the 7-byte record holds the left value's low
    24 bits in the first 3 bytes and its top 4 bits in the high nibble of
    the middle byte; the middle byte's low nibble and the last 3 bytes
    form the right value.
    """
    # Bug fix: this used to call the bare name unpack(), which is not
    # defined anywhere, so parsing any 28-bit-record database raised
    # NameError.  It must be struct.unpack.
    a, b = struct.unpack("!LL", s + b'\x00')
    return (((a & 0xf0) << 20) + (a >> 8)), ((a & 0x0f) << 24) + (b >> 8)
class Tree(object):
    """A single node of the binary search tree: 'left' and 'right' hold the
       raw record values read from the tree section."""
    def __init__(self, left, right):
        self.left, self.right = left, right
def resolve_tree(tree, data):
    """Fill in the left_item and right_item fields for all values in the tree
       so that they point to another Tree, or to a Datum, or to None."""
    probe = Datum(None, None, None, None)
    n_nodes = len(tree)
    def lookup(record):
        "Helper: turn a single record value into a Tree, a Datum, or None."
        # Values below the node count index another tree node; the node
        # count itself means "no data here"; larger values point into the
        # data section (offset past the 16-byte section separator).
        if record < n_nodes:
            return tree[record]
        if record == n_nodes:
            return None
        probe.pos = record - n_nodes - 16
        found = bisect.bisect_left(data, probe)
        assert data[found].pos == probe.pos
        return data[found]

    for node in tree:
        node.left_item = lookup(node.left)
        node.right_item = lookup(node.right)
def parse_search_tree(s, record_size):
    """Given a bytestring and a record size in bits, parse the tree.
       Return a list of nodes."""
    try:
        decode_pair = { 24: to_int24,
                        28: to_int28,
                        32: to_int32 }[record_size]
    except KeyError:
        raise NotImplementedError("Unsupported record size in bits: %d" %
                                  record_size)
    # Each node stores two records, so it occupies record_size*2 bits.
    stride = (record_size * 2) // 8
    nodes = []
    for offset in range(0, len(s), stride):
        left, right = decode_pair(s[offset:offset + stride])
        nodes.append(Tree(left, right))
    return nodes
class Datum(object):
    """Holds a single entry from the Data section"""
    def __init__(self, pos, kind, ln, data):
        self.pos = pos        # Byte offset of this record within the data section.
        self.kind = kind      # Type code of this record; one of TP_*.
        self.ln = ln          # The (heavily overloaded) length field.
        self.data = data      # Raw payload bytes.
        self.children = None  # Child Datum list, used for arrays and maps.

    def __repr__(self):
        return "Datum(%r,%r,%r,%r)" % (self.pos, self.kind, self.ln, self.data)

    # Ordering is by position only, so that a list of Datum objects sorted
    # by 'pos' can be binary-searched with bisect.
    def __lt__(self, other):
        return self.pos < other.pos

    def __gt__(self, other):
        return self.pos > other.pos

    def __eq__(self, other):
        return self.pos == other.pos

    def build_maps(self):
        """If this is a map or array, fill in its 'map' field if it's a map,
           and the 'map' field of all its children."""
        # Only containers still carrying a child count need processing.
        if not hasattr(self, 'nChildren'):
            return

        if self.kind == TP_ARRAY:
            del self.nChildren
            for child in self.children:
                child.build_maps()

        elif self.kind == TP_MAP:
            del self.nChildren
            self.map = {}
            # Children alternate key, value, key, value, ...
            for idx in range(0, len(self.children), 2):
                key = self.children[idx].deref()
                val = self.children[idx + 1].deref()
                val.build_maps()
                if key.kind != TP_UTF8:
                    raise ValueError("Bad dictionary key type %d" % key.kind)
                self.map[bytesToStr(key.data)] = val

    def int_val(self):
        """If this is an integer type, return its value"""
        assert self.kind in (TP_UINT16, TP_UINT32, TP_UINT64,
                             TP_UINT128, TP_SINT32)
        value = to_int(self.data)
        # Sign-extend 32-bit signed values.
        if self.kind == TP_SINT32 and value & 0x80000000:
            value -= 0x100000000
        return value

    def deref(self):
        """If this value is a pointer, return its pointed-to-value. Chase
           through multiple layers of pointers if need be. If this isn't
           a pointer, return it."""
        target = self
        hops = 0
        while target.kind == TP_PTR:
            target = target.ptr
            hops += 1
            assert hops < 100
        return target
def resolve_pointers(data):
    """Fill in the ptr field of every pointer in data."""
    probe = Datum(None, None, None, None)
    for datum in data:
        if datum.kind != TP_PTR:
            continue
        # For pointers, 'ln' holds the target's data-section offset;
        # binary-search the position-sorted data list for it.
        probe.pos = datum.ln
        found = bisect.bisect_left(data, probe)
        assert data[found].pos == datum.ln
        datum.ptr = data[found]
# Data-section type codes, as assigned by the MaxMind-DB format spec.
TP_PTR = 1       # Pointer to another data-section entry.
TP_UTF8 = 2      # UTF-8 string.
TP_DBL = 3       # Double-precision float.
TP_BYTES = 4     # Raw byte string.
TP_UINT16 = 5    # Unsigned 16-bit integer.
TP_UINT32 = 6    # Unsigned 32-bit integer.
TP_MAP = 7       # Map (key/value dictionary).
TP_SINT32 = 8    # Signed 32-bit integer.
TP_UINT64 = 9    # Unsigned 64-bit integer.
TP_UINT128 = 10  # Unsigned 128-bit integer.
TP_ARRAY = 11    # Array.
TP_DCACHE = 12   # Data cache container.
TP_END = 13      # End marker.
TP_BOOL = 14     # Boolean.
TP_FLOAT = 15    # Single-precision float.
def get_type_and_len(s):
    """Data parsing helper: decode the type value and much-overloaded 'length'
       field for the value starting at s. Return a 3-tuple of type, length,
       and number of bytes used to encode type-plus-length."""
    c = byte_to_int(s[0])
    # High 3 bits of the control byte are the type...
    tp = c >> 5
    skip = 1
    if tp == 0:
        # ...except that type 0 means "extended": the real type is the
        # next byte plus 7.
        tp = byte_to_int(s[1])+7
        skip = 2
    # Low 5 bits are the (overloaded) length field.
    ln = c & 31

    # I'm sure I don't know what they were thinking here...
    if tp == TP_PTR:
        # For pointers the "length" is really the target offset: the top
        # 2 bits of ln give the pointer's byte width minus one, and (for
        # widths 1-3) its low 3 bits contribute high bits of the value.
        len_len = (ln >> 3) + 1
        if len_len < 4:
            ln &= 7
            ln <<= len_len * 8
        else:
            ln = 0
        ln += to_int(s[skip:skip+len_len])
        # Spec-defined additive bias for each pointer width.
        ln += (0, 0, 2048, 526336, 0)[len_len]
        skip += len_len
    elif ln >= 29:
        # Lengths 29..31 mean the real length is in the next 1..3 bytes,
        # again with a spec-defined additive bias.
        len_len = ln - 28
        ln = to_int(s[skip:skip+len_len])
        ln += (0, 29, 285, 65821)[len_len]
        skip += len_len

    return tp, ln, skip
# Set of types for which 'length' doesn't mean length.
# (Restored the closing bracket of the set literal, which was missing and
# made the statement syntactically unterminated.)
IGNORE_LEN_TYPES = set([
    TP_MAP,    # Length is number of key-value pairs that follow.
    TP_ARRAY,  # Length is number of members that follow.
    TP_PTR,    # Length is index to pointed-to data element.
    TP_BOOL,   # Length is 0 or 1.
    TP_DCACHE, # Length is number of members that follow
])
def parse_data_section(s):
    """Given a data section encoded in a bytestring, return a list of
       Datum items."""

    # Stack of possibly nested containers. We use the 'nChildren' member of
    # the last one to tell how many more items nest directly inside.
    stack = []

    # List of all items, including nested ones.
    data = []

    # Byte index within the data section.
    pos = 0

    while s:
        tp, ln, skip = get_type_and_len(s)
        # Containers, pointers, and booleans carry no inline payload:
        # their 'length' field means something else entirely.
        if tp in IGNORE_LEN_TYPES:
            real_len = 0
        else:
            real_len = ln

        d = Datum(pos, tp, ln, s[skip:skip+real_len])
        data.append(d)
        pos += skip+real_len
        s = s[skip+real_len:]

        # If a container is open, this datum is its next direct child;
        # close the container once its expected child count is reached.
        if stack:
            stack[-1].children.append(d)
            stack[-1].nChildren -= 1
            if stack[-1].nChildren == 0:
                del stack[-1]

        # Open a new container scope for arrays and maps.  Maps expect
        # ln key-value pairs, i.e. 2*ln child items.
        if d.kind == TP_ARRAY:
            d.nChildren = d.ln
            d.children = []
            stack.append(d)
        elif d.kind == TP_MAP:
            d.nChildren = d.ln * 2
            d.children = []
            stack.append(d)

    return data
def parse_mm_file(s):
    """Parse a MaxMind-DB file held in bytestring s; return a 3-tuple of
       (metadata data list, search tree node list, data list)."""
    # The metadata section starts after the last occurrence of the marker.
    try:
        meta_start = s.rindex(METADATA_MARKER)
    except ValueError:
        raise ValueError("No metadata!")

    metadata = parse_data_section(s[meta_start + len(METADATA_MARKER):])
    if metadata[0].kind != TP_MAP:
        raise ValueError("Bad map")
    metadata[0].build_maps()
    info = metadata[0].map

    record_size = info['record_size'].int_val()
    tree_bytes = ((record_size * 2) // 8) * info['node_count'].int_val()

    # Sixteen zero bytes separate the search tree from the data section.
    if s[tree_bytes:tree_bytes + 16] != b'\x00' * 16:
        raise ValueError("Missing section separator!")

    tree = parse_search_tree(s[:tree_bytes], record_size)
    data = parse_data_section(s[tree_bytes + 16:meta_start])

    resolve_pointers(data)
    resolve_tree(tree, data)
    for datum in data:
        datum.build_maps()

    return metadata, tree, data
def format_datum(datum):
    """Given a Datum at a leaf of the tree, return the string that we should
    write as its value, or None if no country code is present.

    We first try country->iso_code which is the two-character ISO 3166-1
    country code of the country where MaxMind believes the end user is
    located.  If there's no such key, we try registered_country->iso_code
    which is the country in which the ISP has registered the IP address.
    Without falling back to registered_country, we'd leave out all ranges
    that MaxMind thinks belong to anonymous proxies, because those ranges
    don't contain country but only registered_country.  In short: let's
    fill all A1 entries with what ARIN et. al think.
    """
    # (Restored the docstring's missing closing quotes, which left the
    # function body swallowed by an unterminated string literal.)
    try:
        return bytesToStr(datum.map['country'].map['iso_code'].data)
    except KeyError:
        pass
    try:
        return bytesToStr(datum.map['registered_country'].map['iso_code'].data)
    except KeyError:
        pass
    return None
# An IPv6 bit-prefix denoting embedded IPv4 space: 96 leading zero bits.
IPV4_PREFIX = "0"*96

def dump_item_ipv4(entries, prefix, val):
    """Dump the information for an IPv4 address to entries, where 'prefix'
    is a string holding a binary prefix for the address, and 'val' is the
    value to dump.  If the prefix is not an IPv4 address (it does not start
    with 96 bits of 0), then print nothing.
    """
    # (Restored the docstring's missing closing quotes.)
    if not prefix.startswith(IPV4_PREFIX):
        return
    # NOTE(review): a leaf at exactly depth 96 would leave prefix empty and
    # int("", 2) would raise ValueError -- presumably never happens in real
    # databases; confirm before relying on it.
    prefix = prefix[96:]
    v = int(prefix, 2)
    shift = 32 - len(prefix)
    lo = v << shift
    hi = ((v+1) << shift) - 1
    entries.append((lo, hi, val))
def fmt_item_ipv4(entry):
    """Format an IPv4 range with lo and hi addresses in decimal form."""
    lo, hi, code = entry
    return "%d,%d,%s\n" % (lo, hi, code)
def fmt_ipv6_addr(v):
    """Given a 128-bit integer representing an ipv6 address, return a
       string for that ipv6 address."""
    # Render the integer as 32 hex digits, turn that into the 16-byte
    # packed form, and let the OS produce the canonical text form.
    packed = binascii.unhexlify("%032x" % v)
    return socket.inet_ntop(socket.AF_INET6, packed)
def fmt_item_ipv6(entry):
    """Format an IPv6 range with lo and hi addresses in hex form."""
    lo, hi, code = entry
    return "%s,%s,%s\n" % (fmt_ipv6_addr(lo), fmt_ipv6_addr(hi), code)
# Bit-prefixes of IPv6 ranges that dump_item_ipv6() skips because they
# duplicate IPv4 space: IPv4-mapped (::ffff:0:0/96), 6to4 (2002::/16),
# and Teredo (2001::/32).
IPV4_MAPPED_IPV6_PREFIX = "0"*80 + "1"*16
IPV6_6TO4_PREFIX = "0010000000000010"
TEREDO_IPV6_PREFIX = "0010000000000001" + "0"*16
def dump_item_ipv6(entries, prefix, val):
    """Dump the information for an IPv6 address prefix to entries, where
    'prefix' is a string holding a binary prefix for the address,
    and 'val' is the value to dump.  If the prefix is an IPv4 address
    (starts with 96 bits of 0), is an IPv4-mapped IPv6 address
    (::ffff:0:0/96), or is in the 6to4 mapping subnet (2002::/16), then
    print nothing.
    """
    # (Restored the docstring's missing closing quotes.)
    # Skip ranges that embed IPv4 space; those belong to the IPv4 pass.
    if prefix.startswith(IPV4_PREFIX) or \
       prefix.startswith(IPV4_MAPPED_IPV6_PREFIX) or \
       prefix.startswith(IPV6_6TO4_PREFIX) or \
       prefix.startswith(TEREDO_IPV6_PREFIX):
        return
    v = int(prefix, 2)
    shift = 128 - len(prefix)
    lo = v << shift
    hi = ((v+1) << shift) - 1
    entries.append((lo, hi, val))
def dump_tree(entries, node, dump_item, prefix=""):
    """Walk the tree rooted at 'node', and call dump_item on the
       format_datum output of every leaf of the tree."""
    # Iterative depth-first walk; pushing the right child before the left
    # keeps leaves in the same left-to-right order as a recursive walk.
    pending = [(node, prefix)]
    while pending:
        cur, bits = pending.pop()
        if isinstance(cur, Tree):
            pending.append((cur.right_item, bits + "1"))
            pending.append((cur.left_item, bits + "0"))
        elif isinstance(cur, Datum):
            assert cur.kind == TP_MAP
            code = format_datum(cur)
            if code:
                dump_item(entries, bits, code)
        else:
            assert cur == None
# Header written at the top of each generated geoip file; the %s is filled
# in with the database's build date.  (Restored the string's missing
# closing triple-quote, which left it unterminated.)
GEOIP_FILE_HEADER = """\
# Last updated based on %s Maxmind GeoLite2 Country
# wget https://geolite.maxmind.com/download/geoip/database/GeoLite2-Country.mmdb.gz
# gunzip GeoLite2-Country.mmdb.gz
# python mmdb-convert.py GeoLite2-Country.mmdb
"""
def write_geoip_file(filename, metadata, the_tree, dump_item, fmt_item):
    """Write the entries in the_tree to filename.

    'dump_item' extracts (lo, hi, value) entries from the tree leaves, and
    'fmt_item' formats one such entry as an output line.
    """
    entries = []
    dump_tree(entries, the_tree[0], dump_item)
    build_epoch = metadata[0].map['build_epoch'].int_val()
    # Use a context manager so the file is closed even if a write raises;
    # the original leaked the handle on error.
    with open(filename, 'w') as fobj:
        # NOTE(review): '%-d' (day without zero-padding) is a glibc
        # strftime extension; confirm on non-glibc platforms.
        fobj.write(GEOIP_FILE_HEADER %
                   time.strftime('%B %-d %Y', time.gmtime(build_epoch)))

        # Coalesce runs of adjacent ranges that share a value before
        # writing them out.
        unwritten = None
        for entry in entries:
            if not unwritten:
                unwritten = entry
            elif unwritten[1] + 1 == entry[0] and unwritten[2] == entry[2]:
                unwritten = (unwritten[0], entry[1], unwritten[2])
            else:
                fobj.write(fmt_item(unwritten))
                unwritten = entry
        if unwritten:
            fobj.write(fmt_item(unwritten))
# Command-line entry point: read the .mmdb file named on the command line
# and emit 'geoip' (IPv4) and 'geoip6' (IPv6) in Tor's format.  Use a
# context manager so the input handle is closed promptly (the original
# left it to the garbage collector).
with open(sys.argv[1], 'rb') as _infile:
    content = _infile.read()
metadata, the_tree, _ = parse_mm_file(content)
write_geoip_file('geoip', metadata, the_tree, dump_item_ipv4, fmt_item_ipv4)
write_geoip_file('geoip6', metadata, the_tree, dump_item_ipv6, fmt_item_ipv6)