Merge branch 'tor-github/pr/415' into maint-0.3.5
[tor.git] / src / config / mmdb-convert.py
blob3a454a3fc1bb87b6c4850825f421ac0b0f2b32a3
1 #!/usr/bin/python3
3 # This software has been dedicated to the public domain under the CC0
4 # public domain dedication.
6 # To the extent possible under law, the person who associated CC0
7 # with mmdb-convert.py has waived all copyright and related or
8 # neighboring rights to mmdb-convert.py.
10 # You should have received a copy of the CC0 legalcode along with this
11 # work in doc/cc0.txt. If not, see
12 # <http://creativecommons.org/publicdomain/zero/1.0/>.
14 # Nick Mathewson is responsible for this kludge, but takes no
15 # responsibility for it.
17 """This kludge is meant to
18 parse mmdb files in sufficient detail to dump out the old format
19 that Tor expects. It's also meant to be pure-python.
21 When given a simplicity/speed tradeoff, it opts for simplicity.
23 You will not understand the code without understanding the MaxMind-DB
24 file format. It is specified at:
25 https://github.com/maxmind/MaxMind-DB/blob/master/MaxMind-DB-spec.md.
27 This isn't so much tested. When it breaks, you get to keep both
28 pieces.
29 """
31 import struct
32 import bisect
33 import socket
34 import binascii
35 import sys
36 import time
# Every MMDB file ends with a metadata section introduced by this marker;
# parse_mm_file() searches for its *last* occurrence with rindex().
METADATA_MARKER = b'\xab\xcd\xefMaxMind.com'

# Here's some python2/python3 junk. Better solutions wanted.
# On Python 3, indexing a bytestring yields an int, so ord() raises
# TypeError and byte_to_int becomes the identity function; on Python 2,
# indexing yields a one-character str and ord() is the right converter.
try:
    ord(b"1"[0])
except TypeError:
    def byte_to_int(b):
        "convert a single element of a bytestring to an integer."
        return b
else:
    byte_to_int = ord

# Here's some more python2/python3 junk. Better solutions wanted.
# On Python 2 the two-argument str() call raises TypeError and bytes are
# already str, so bytesToStr is just str; on Python 3 we decode as UTF-8.
try:
    str(b"a", "utf8")
except TypeError:
    bytesToStr = str
else:
    def bytesToStr(b):
        "convert a bytestring in utf8 to a string."
        return str(b, 'utf8')
def to_int(s):
    "Parse a big-endian integer from bytestring s."
    value = 0
    for octet in s:
        value = (value << 8) | byte_to_int(octet)
    return value
def to_int24(s):
    "Parse a pair of big-endian 24-bit integers from bytestring s."
    # Six bytes hold two 24-bit values; unpack as three 16-bit words and
    # redistribute the middle word's halves.
    hi, mid, lo = struct.unpack("!HHH", s)
    left = (hi << 8) | (mid >> 8)
    right = ((mid & 0xff) << 16) | lo
    return left, right
def to_int32(s):
    "Parse a pair of big-endian 32-bit integers from bytestring s."
    # struct.unpack already yields the (left, right) tuple we want.
    return struct.unpack("!LL", s)
def to_int28(s):
    """Parse a pair of big-endian 28-bit integers from bytestring s.

    Per the MaxMind-DB spec, the 7-byte record holds the left value's low
    24 bits in the first 3 bytes and its top 4 bits in the high nibble of
    the middle byte; the middle byte's low nibble and the last 3 bytes
    form the right value.
    """
    # Bug fix: this used to call the bare name unpack(), which is not
    # defined anywhere, so parsing any 28-bit-record database raised
    # NameError.  It must be struct.unpack.
    a, b = struct.unpack("!LL", s + b'\x00')
    return (((a & 0xf0) << 20) + (a >> 8)), ((a & 0x0f) << 24) + (b >> 8)
class Tree(object):
    """A single node of the binary search tree: 'left' and 'right' hold the
       raw record values read from the tree section."""
    def __init__(self, left, right):
        self.left, self.right = left, right
def resolve_tree(tree, data):
    """Fill in the left_item and right_item fields for all values in the tree
       so that they point to another Tree, or to a Datum, or to None."""
    probe = Datum(None, None, None, None)
    n_nodes = len(tree)
    def lookup(record):
        "Helper: turn a single record value into a Tree, a Datum, or None."
        # Values below the node count index another tree node; the node
        # count itself means "no data here"; larger values point into the
        # data section (offset past the 16-byte section separator).
        if record < n_nodes:
            return tree[record]
        if record == n_nodes:
            return None
        probe.pos = record - n_nodes - 16
        found = bisect.bisect_left(data, probe)
        assert data[found].pos == probe.pos
        return data[found]

    for node in tree:
        node.left_item = lookup(node.left)
        node.right_item = lookup(node.right)
def parse_search_tree(s, record_size):
    """Given a bytestring and a record size in bits, parse the tree.
       Return a list of nodes."""
    try:
        decode_pair = { 24: to_int24,
                        28: to_int28,
                        32: to_int32 }[record_size]
    except KeyError:
        raise NotImplementedError("Unsupported record size in bits: %d" %
                                  record_size)
    # Each node stores two records, so it occupies record_size*2 bits.
    stride = (record_size * 2) // 8
    nodes = []
    for offset in range(0, len(s), stride):
        left, right = decode_pair(s[offset:offset + stride])
        nodes.append(Tree(left, right))
    return nodes
class Datum(object):
    """Holds a single entry from the Data section"""
    def __init__(self, pos, kind, ln, data):
        self.pos = pos        # Byte offset of this record within the data section.
        self.kind = kind      # Type code of this record; one of TP_*.
        self.ln = ln          # The (heavily overloaded) length field.
        self.data = data      # Raw payload bytes.
        self.children = None  # Child Datum list, used for arrays and maps.

    def __repr__(self):
        return "Datum(%r,%r,%r,%r)" % (self.pos, self.kind, self.ln, self.data)

    # Ordering is by position only, so that a list of Datum objects sorted
    # by 'pos' can be binary-searched with bisect.
    def __lt__(self, other):
        return self.pos < other.pos

    def __gt__(self, other):
        return self.pos > other.pos

    def __eq__(self, other):
        return self.pos == other.pos

    def build_maps(self):
        """If this is a map or array, fill in its 'map' field if it's a map,
           and the 'map' field of all its children."""
        # Only containers still carrying a child count need processing.
        if not hasattr(self, 'nChildren'):
            return

        if self.kind == TP_ARRAY:
            del self.nChildren
            for child in self.children:
                child.build_maps()

        elif self.kind == TP_MAP:
            del self.nChildren
            self.map = {}
            # Children alternate key, value, key, value, ...
            for idx in range(0, len(self.children), 2):
                key = self.children[idx].deref()
                val = self.children[idx + 1].deref()
                val.build_maps()
                if key.kind != TP_UTF8:
                    raise ValueError("Bad dictionary key type %d" % key.kind)
                self.map[bytesToStr(key.data)] = val

    def int_val(self):
        """If this is an integer type, return its value"""
        assert self.kind in (TP_UINT16, TP_UINT32, TP_UINT64,
                             TP_UINT128, TP_SINT32)
        value = to_int(self.data)
        # Sign-extend 32-bit signed values.
        if self.kind == TP_SINT32 and value & 0x80000000:
            value -= 0x100000000
        return value

    def deref(self):
        """If this value is a pointer, return its pointed-to-value. Chase
           through multiple layers of pointers if need be. If this isn't
           a pointer, return it."""
        target = self
        hops = 0
        while target.kind == TP_PTR:
            target = target.ptr
            hops += 1
            assert hops < 100
        return target
def resolve_pointers(data):
    """Fill in the ptr field of every pointer in data."""
    probe = Datum(None, None, None, None)
    for datum in data:
        if datum.kind != TP_PTR:
            continue
        # For pointers, 'ln' holds the target's data-section offset;
        # binary-search the position-sorted data list for it.
        probe.pos = datum.ln
        found = bisect.bisect_left(data, probe)
        assert data[found].pos == datum.ln
        datum.ptr = data[found]
# Data-section type codes, as assigned by the MaxMind-DB format spec.
TP_PTR = 1       # Pointer to another data-section entry.
TP_UTF8 = 2      # UTF-8 string.
TP_DBL = 3       # Double-precision float.
TP_BYTES = 4     # Raw byte string.
TP_UINT16 = 5    # Unsigned 16-bit integer.
TP_UINT32 = 6    # Unsigned 32-bit integer.
TP_MAP = 7       # Map (key/value dictionary).
TP_SINT32 = 8    # Signed 32-bit integer.
TP_UINT64 = 9    # Unsigned 64-bit integer.
TP_UINT128 = 10  # Unsigned 128-bit integer.
TP_ARRAY = 11    # Array.
TP_DCACHE = 12   # Data cache container.
TP_END = 13      # End marker.
TP_BOOL = 14     # Boolean.
TP_FLOAT = 15    # Single-precision float.
def get_type_and_len(s):
    """Data parsing helper: decode the type value and much-overloaded 'length'
       field for the value starting at s. Return a 3-tuple of type, length,
       and number of bytes used to encode type-plus-length."""
    c = byte_to_int(s[0])
    # High 3 bits of the control byte are the type...
    tp = c >> 5
    skip = 1
    if tp == 0:
        # ...except that type 0 means "extended": the real type is the
        # next byte plus 7.
        tp = byte_to_int(s[1])+7
        skip = 2
    # Low 5 bits are the (overloaded) length field.
    ln = c & 31

    # I'm sure I don't know what they were thinking here...
    if tp == TP_PTR:
        # For pointers the "length" is really the target offset: the top
        # 2 bits of ln give the pointer's byte width minus one, and (for
        # widths 1-3) its low 3 bits contribute high bits of the value.
        len_len = (ln >> 3) + 1
        if len_len < 4:
            ln &= 7
            ln <<= len_len * 8
        else:
            ln = 0
        ln += to_int(s[skip:skip+len_len])
        # Spec-defined additive bias for each pointer width.
        ln += (0, 0, 2048, 526336, 0)[len_len]
        skip += len_len
    elif ln >= 29:
        # Lengths 29..31 mean the real length is in the next 1..3 bytes,
        # again with a spec-defined additive bias.
        len_len = ln - 28
        ln = to_int(s[skip:skip+len_len])
        ln += (0, 29, 285, 65821)[len_len]
        skip += len_len

    return tp, ln, skip
# Set of types for which 'length' doesn't mean length.
# (Restored the closing bracket of the set literal, which was missing and
# made the statement syntactically unterminated.)
IGNORE_LEN_TYPES = set([
    TP_MAP,    # Length is number of key-value pairs that follow.
    TP_ARRAY,  # Length is number of members that follow.
    TP_PTR,    # Length is index to pointed-to data element.
    TP_BOOL,   # Length is 0 or 1.
    TP_DCACHE, # Length is number of members that follow
])
def parse_data_section(s):
    """Given a data section encoded in a bytestring, return a list of
       Datum items."""

    # Stack of possibly nested containers. We use the 'nChildren' member of
    # the last one to tell how many more items nest directly inside.
    stack = []

    # List of all items, including nested ones.
    data = []

    # Byte index within the data section.
    pos = 0

    while s:
        tp, ln, skip = get_type_and_len(s)
        # Containers, pointers, and booleans carry no inline payload:
        # their 'length' field means something else entirely.
        if tp in IGNORE_LEN_TYPES:
            real_len = 0
        else:
            real_len = ln

        d = Datum(pos, tp, ln, s[skip:skip+real_len])
        data.append(d)
        pos += skip+real_len
        s = s[skip+real_len:]

        # If a container is open, this datum is its next direct child;
        # close the container once its expected child count is reached.
        if stack:
            stack[-1].children.append(d)
            stack[-1].nChildren -= 1
            if stack[-1].nChildren == 0:
                del stack[-1]

        # Open a new container scope for arrays and maps.  Maps expect
        # ln key-value pairs, i.e. 2*ln child items.
        if d.kind == TP_ARRAY:
            d.nChildren = d.ln
            d.children = []
            stack.append(d)
        elif d.kind == TP_MAP:
            d.nChildren = d.ln * 2
            d.children = []
            stack.append(d)

    return data
def parse_mm_file(s):
    """Parse a MaxMind-DB file held in bytestring s; return a 3-tuple of
       (metadata data list, search tree node list, data list)."""
    # The metadata section starts after the last occurrence of the marker.
    try:
        meta_start = s.rindex(METADATA_MARKER)
    except ValueError:
        raise ValueError("No metadata!")

    metadata = parse_data_section(s[meta_start + len(METADATA_MARKER):])
    if metadata[0].kind != TP_MAP:
        raise ValueError("Bad map")
    metadata[0].build_maps()
    info = metadata[0].map

    record_size = info['record_size'].int_val()
    tree_bytes = ((record_size * 2) // 8) * info['node_count'].int_val()

    # Sixteen zero bytes separate the search tree from the data section.
    if s[tree_bytes:tree_bytes + 16] != b'\x00' * 16:
        raise ValueError("Missing section separator!")

    tree = parse_search_tree(s[:tree_bytes], record_size)
    data = parse_data_section(s[tree_bytes + 16:meta_start])

    resolve_pointers(data)
    resolve_tree(tree, data)
    for datum in data:
        datum.build_maps()

    return metadata, tree, data
def format_datum(datum):
    """Given a Datum at a leaf of the tree, return the string that we should
    write as its value, or None if no country code is present.

    We first try country->iso_code which is the two-character ISO 3166-1
    country code of the country where MaxMind believes the end user is
    located.  If there's no such key, we try registered_country->iso_code
    which is the country in which the ISP has registered the IP address.
    Without falling back to registered_country, we'd leave out all ranges
    that MaxMind thinks belong to anonymous proxies, because those ranges
    don't contain country but only registered_country.  In short: let's
    fill all A1 entries with what ARIN et. al think.
    """
    # (Restored the docstring's missing closing quotes, which left the
    # function body swallowed by an unterminated string literal.)
    try:
        return bytesToStr(datum.map['country'].map['iso_code'].data)
    except KeyError:
        pass
    try:
        return bytesToStr(datum.map['registered_country'].map['iso_code'].data)
    except KeyError:
        pass
    return None
# An IPv6 bit-prefix denoting embedded IPv4 space: 96 leading zero bits.
IPV4_PREFIX = "0"*96

def dump_item_ipv4(entries, prefix, val):
    """Dump the information for an IPv4 address to entries, where 'prefix'
    is a string holding a binary prefix for the address, and 'val' is the
    value to dump.  If the prefix is not an IPv4 address (it does not start
    with 96 bits of 0), then print nothing.
    """
    # (Restored the docstring's missing closing quotes.)
    if not prefix.startswith(IPV4_PREFIX):
        return
    # NOTE(review): a leaf at exactly depth 96 would leave prefix empty and
    # int("", 2) would raise ValueError -- presumably never happens in real
    # databases; confirm before relying on it.
    prefix = prefix[96:]
    v = int(prefix, 2)
    shift = 32 - len(prefix)
    lo = v << shift
    hi = ((v+1) << shift) - 1
    entries.append((lo, hi, val))
def fmt_item_ipv4(entry):
    """Format an IPv4 range with lo and hi addresses in decimal form."""
    lo, hi, code = entry
    return "%d,%d,%s\n" % (lo, hi, code)
def fmt_ipv6_addr(v):
    """Given a 128-bit integer representing an ipv6 address, return a
       string for that ipv6 address."""
    # Render the integer as 32 hex digits, turn that into the 16-byte
    # packed form, and let the OS produce the canonical text form.
    packed = binascii.unhexlify("%032x" % v)
    return socket.inet_ntop(socket.AF_INET6, packed)
def fmt_item_ipv6(entry):
    """Format an IPv6 range with lo and hi addresses in hex form."""
    lo, hi, code = entry
    return "%s,%s,%s\n" % (fmt_ipv6_addr(lo), fmt_ipv6_addr(hi), code)
# Bit-prefixes of IPv6 ranges that dump_item_ipv6() skips because they
# duplicate IPv4 space: IPv4-mapped (::ffff:0:0/96), 6to4 (2002::/16),
# and Teredo (2001::/32).
IPV4_MAPPED_IPV6_PREFIX = "0"*80 + "1"*16
IPV6_6TO4_PREFIX = "0010000000000010"
TEREDO_IPV6_PREFIX = "0010000000000001" + "0"*16
def dump_item_ipv6(entries, prefix, val):
    """Dump the information for an IPv6 address prefix to entries, where
    'prefix' is a string holding a binary prefix for the address,
    and 'val' is the value to dump.  If the prefix is an IPv4 address
    (starts with 96 bits of 0), is an IPv4-mapped IPv6 address
    (::ffff:0:0/96), or is in the 6to4 mapping subnet (2002::/16), then
    print nothing.
    """
    # (Restored the docstring's missing closing quotes.)
    # Skip ranges that embed IPv4 space; those belong to the IPv4 pass.
    if prefix.startswith(IPV4_PREFIX) or \
       prefix.startswith(IPV4_MAPPED_IPV6_PREFIX) or \
       prefix.startswith(IPV6_6TO4_PREFIX) or \
       prefix.startswith(TEREDO_IPV6_PREFIX):
        return
    v = int(prefix, 2)
    shift = 128 - len(prefix)
    lo = v << shift
    hi = ((v+1) << shift) - 1
    entries.append((lo, hi, val))
def dump_tree(entries, node, dump_item, prefix=""):
    """Walk the tree rooted at 'node', and call dump_item on the
       format_datum output of every leaf of the tree."""
    # Iterative depth-first walk; pushing the right child before the left
    # keeps leaves in the same left-to-right order as a recursive walk.
    pending = [(node, prefix)]
    while pending:
        cur, bits = pending.pop()
        if isinstance(cur, Tree):
            pending.append((cur.right_item, bits + "1"))
            pending.append((cur.left_item, bits + "0"))
        elif isinstance(cur, Datum):
            assert cur.kind == TP_MAP
            code = format_datum(cur)
            if code:
                dump_item(entries, bits, code)
        else:
            assert cur == None
# Header written at the top of each generated geoip file; the %s is filled
# in with the database's build date.  (Restored the string's missing
# closing triple-quote, which left it unterminated.)
GEOIP_FILE_HEADER = """\
# Last updated based on %s Maxmind GeoLite2 Country
# wget https://geolite.maxmind.com/download/geoip/database/GeoLite2-Country.mmdb.gz
# gunzip GeoLite2-Country.mmdb.gz
# python mmdb-convert.py GeoLite2-Country.mmdb
"""
def write_geoip_file(filename, metadata, the_tree, dump_item, fmt_item):
    """Write the entries in the_tree to filename.

    'dump_item' extracts (lo, hi, value) entries from the tree leaves, and
    'fmt_item' formats one such entry as an output line.
    """
    entries = []
    dump_tree(entries, the_tree[0], dump_item)
    build_epoch = metadata[0].map['build_epoch'].int_val()
    # Use a context manager so the file is closed even if a write raises;
    # the original leaked the handle on error.
    with open(filename, 'w') as fobj:
        # NOTE(review): '%-d' (day without zero-padding) is a glibc
        # strftime extension; confirm on non-glibc platforms.
        fobj.write(GEOIP_FILE_HEADER %
                   time.strftime('%B %-d %Y', time.gmtime(build_epoch)))

        # Coalesce runs of adjacent ranges that share a value before
        # writing them out.
        unwritten = None
        for entry in entries:
            if not unwritten:
                unwritten = entry
            elif unwritten[1] + 1 == entry[0] and unwritten[2] == entry[2]:
                unwritten = (unwritten[0], entry[1], unwritten[2])
            else:
                fobj.write(fmt_item(unwritten))
                unwritten = entry
        if unwritten:
            fobj.write(fmt_item(unwritten))
# Command-line entry point: read the .mmdb file named on the command line
# and emit 'geoip' (IPv4) and 'geoip6' (IPv6) in Tor's format.  Use a
# context manager so the input handle is closed promptly (the original
# left it to the garbage collector).
with open(sys.argv[1], 'rb') as _infile:
    content = _infile.read()
metadata, the_tree, _ = parse_mm_file(content)
write_geoip_file('geoip', metadata, the_tree, dump_item_ipv4, fmt_item_ipv4)
write_geoip_file('geoip6', metadata, the_tree, dump_item_ipv6, fmt_item_ipv6)