3 # This software has been dedicated to the public domain under the CC0
4 # public domain dedication.
6 # To the extent possible under law, the person who associated CC0
7 # with mmdb-convert.py has waived all copyright and related or
8 # neighboring rights to mmdb-convert.py.
10 # You should have received a copy of the CC0 legalcode along with this
11 # work in doc/cc0.txt. If not, see
12 # <http://creativecommons.org/publicdomain/zero/1.0/>.
14 # Nick Mathewson is responsible for this kludge, but takes no
15 # responsibility for it.
"""This kludge is meant to
parse mmdb files in sufficient detail to dump out the old format
that Tor expects.  It's also meant to be pure-python.

When given a simplicity/speed tradeoff, it opts for simplicity.

You will not understand the code without understanding the MaxMind-DB
file format.  It is specified at:
https://github.com/maxmind/MaxMind-DB/blob/master/MaxMind-DB-spec.md.

This isn't so much tested.  When it breaks, you get to keep both
pieces.
"""

import binascii
import bisect
import socket
import struct
import sys
import time
# Every MaxMind-DB file ends with a metadata section introduced by this
# 14-byte marker.
METADATA_MARKER = b'\xab\xcd\xefMaxMind.com'
# Here's some python2/python3 junk.  Better solutions wanted.
def byte_to_int(b):
    """Convert a single element of a bytestring to an integer.

    Indexing a bytes object yields an int on Python 3 but a one-char
    string on Python 2; accept either and return the integer byte value.
    """
    return b if isinstance(b, int) else ord(b)
# Here's some more python2/python3 junk.  Better solutions wanted.
def bytesToStr(b):
    """Convert a bytestring in utf8 to a string.

    Non-bytes input is returned unchanged (the Python 2 code path, where
    str is already a bytestring).
    """
    return b.decode("utf8") if isinstance(b, bytes) else b
def to_int(s):
    "Parse a big-endian integer from bytestring s."
    result = 0
    for c in s:
        # Shift in one byte at a time, most significant first.
        result *= 256
        result += byte_to_int(c)
    return result
def to_int24(s):
    "Parse a pair of big-endian 24-bit integers from bytestring s."
    # Three 16-bit reads cover the six input bytes; redistribute the
    # middle word's halves between the two 24-bit values.
    a, b, c = struct.unpack("!HHH", s)
    return ((a << 8) + (b >> 8)), (((b & 0xff) << 16) + c)
def to_int32(s):
    "Parse a pair of big-endian 32-bit integers from bytestring s."
    a, b = struct.unpack("!LL", s)
    return a, b
def to_int28(s):
    "Parse a pair of big-endian 28-bit integers from bytestring s."
    # Pad the seven input bytes to eight so they unpack as two 32-bit
    # words.  (Fixed: was a bare 'unpack(', which raises NameError --
    # only the 'struct' module is imported.)
    a, b = struct.unpack("!LL", s + b'\x00')
    # The middle byte's high nibble tops off the first value, its low
    # nibble tops off the second.
    return (((a & 0xf0) << 20) + (a >> 8)), ((a & 0x0f) << 24) + (b >> 8)
class Tree(object):
    "Holds a node in the tree"
    def __init__(self, left, right):
        self.left = left    # Record value of the left (0-bit) branch.
        self.right = right  # Record value of the right (1-bit) branch.
        # left_item / right_item are filled in later by resolve_tree().
def resolve_tree(tree, data):
    """Fill in the left_item and right_item fields for all values in the tree
       so that they point to another Tree, or to a Datum, or to None."""
    d = Datum(None, None, None, None)
    def resolve_item(item):
        "Helper: resolve a single index."
        if item < len(tree):
            # Another node within the search tree.
            return tree[item]
        elif item == len(tree):
            # The spec's "no data" marker.
            return None
        else:
            # An offset into the data section; find the Datum there.
            d.pos = (item - len(tree) - 16)
            p = bisect.bisect_left(data, d)
            assert data[p].pos == d.pos
            return data[p]

    for t in tree:
        t.left_item = resolve_item(t.left)
        t.right_item = resolve_item(t.right)
def parse_search_tree(s, record_size):
    """Given a bytestring and a record size in bits, parse the tree.
       Return a list of nodes."""
    record_bytes = (record_size * 2) // 8
    nodes = []
    p = 0
    try:
        # Pick the record decoder matching the record size.
        to_leftright = { 24: to_int24,
                         28: to_int28,
                         32: to_int32 }[ record_size ]
    except KeyError:
        raise NotImplementedError("Unsupported record size in bits: %d" %
                                  record_size)
    while p < len(s):
        left, right = to_leftright(s[p:p+record_bytes])
        p += record_bytes
        nodes.append( Tree(left, right) )
    return nodes
class Datum(object):
    """Holds a single entry from the Data section"""
    def __init__(self, pos, kind, ln, data):
        self.pos = pos    # Position of this record within data section
        self.kind = kind  # Type of this record. one of TP_*
        self.ln = ln      # Length field, which might be overloaded.
        self.data = data  # Raw bytes data.
        self.children = None # Used for arrays and maps.

    def __repr__(self):
        return "Datum(%r,%r,%r,%r)" % (self.pos, self.kind, self.ln, self.data)

    # Comparison functions used for bsearch
    def __lt__(self, other):
        return self.pos < other.pos

    def __gt__(self, other):
        return self.pos > other.pos

    def __eq__(self, other):
        return self.pos == other.pos

    def build_maps(self):
        """If this is a map or array, fill in its 'map' field if it's a map,
           and the 'map' field of all its children."""

        if not hasattr(self, 'nChildren'):
            # Not a container, or already processed.
            return

        if self.kind == TP_ARRAY:
            del self.nChildren
            for c in self.children:
                c.build_maps()

        elif self.kind == TP_MAP:
            del self.nChildren
            self.map = {}
            # Children alternate: key, value, key, value, ...
            for i in range(0, len(self.children), 2):
                k = self.children[i].deref()
                v = self.children[i+1].deref()
                v.build_maps()
                if k.kind != TP_UTF8:
                    raise ValueError("Bad dictionary key type %d"% k.kind)
                self.map[bytesToStr(k.data)] = v

    def int_val(self):
        """If this is an integer type, return its value"""
        assert self.kind in (TP_UINT16, TP_UINT32, TP_UINT64,
                             TP_UINT128, TP_SINT32)
        i = to_int(self.data)
        if self.kind == TP_SINT32:
            # Reinterpret the unsigned parse as two's-complement.
            if i > 0x7fffffff:
                i -= 0x100000000
        return i

    def deref(self):
        """If this value is a pointer, return its pointed-to-value.  Chase
           through multiple layers of pointers if need be.  If this isn't
           a pointer, return it."""
        n = 0
        s = self
        while s.kind == TP_PTR:
            s = s.ptr
            n += 1
            assert n < 100  # Guard against pointer cycles.
        return s
def resolve_pointers(data):
    """Fill in the ptr field of every pointer in data."""
    search = Datum(None, None, None, None)
    for d in data:
        if d.kind == TP_PTR:
            # For pointers, 'ln' holds the target's data-section offset.
            search.pos = d.ln
            p = bisect.bisect_left(data, search)
            assert data[p].pos == d.ln
            d.ptr = data[p]

# Type codes for data-section values, per the MaxMind-DB specification.
TP_PTR = 1
TP_UTF8 = 2
TP_DBL = 3
TP_BYTES = 4
TP_UINT16 = 5
TP_UINT32 = 6
TP_MAP = 7
TP_SINT32 = 8
TP_UINT64 = 9
TP_UINT128 = 10
TP_ARRAY = 11
TP_DCACHE = 12
TP_END = 13
TP_BOOL = 14
TP_FLOAT = 15
def get_type_and_len(s):
    """Data parsing helper: decode the type value and much-overloaded 'length'
       field for the value starting at s.  Return a 3-tuple of type, length,
       and number of bytes used to encode type-plus-length."""
    c = byte_to_int(s[0])
    tp = c >> 5
    skip = 1
    if tp == 0:
        # "Extended" type: the real type code is the next byte plus 7.
        tp = byte_to_int(s[1]) + 7
        skip = 2
    ln = c & 31

    # I'm sure I don't know what they were thinking here...
    if tp == TP_PTR:
        # Pointer: the low bits select among four offset encodings.
        len_len = (ln >> 3) + 1
        if len_len < 4:
            ln &= 7
            ln <<= len_len * 8
        else:
            ln = 0
        ln += to_int(s[skip:skip+len_len])
        ln += (0, 0, 2048, 526336, 0)[len_len]
        skip += len_len
    elif ln >= 29:
        # Lengths 29-31 flag an extended length in the following bytes.
        len_len = ln - 28
        ln = to_int(s[skip:skip+len_len])
        ln += (0, 29, 285, 65821)[len_len]
        skip += len_len

    return tp, ln, skip
# Set of types for which 'length' doesn't mean length.
IGNORE_LEN_TYPES = set([
    TP_MAP,    # Length is number of key-value pairs that follow.
    TP_ARRAY,  # Length is number of members that follow.
    TP_PTR,    # Length is index to pointed-to data element.
    TP_BOOL,   # Length is 0 or 1.
    TP_DCACHE, # Length is number of members that follow
])
def parse_data_section(s):
    """Given a data section encoded in a bytestring, return a list of
       Datum items."""

    # Stack of possibly nested containers.  We use the 'nChildren' member of
    # the last one to tell how many more items nest directly inside.
    stack = []

    # List of all items, including nested ones.
    all_items = []

    # Byte index within the data section.
    pos = 0

    while s:
        tp, ln, skip = get_type_and_len(s)
        if tp in IGNORE_LEN_TYPES:
            # 'ln' is overloaded for these types; no payload bytes follow.
            real_len = 0
        else:
            real_len = ln

        d = Datum(pos, tp, ln, s[skip:skip+real_len])
        all_items.append(d)
        pos += skip + real_len
        s = s[skip+real_len:]

        if stack:
            # This datum nests inside the innermost open container.
            stack[-1].children.append(d)
            stack[-1].nChildren -= 1
            if stack[-1].nChildren == 0:
                del stack[-1]

        if d.kind == TP_ARRAY:
            d.nChildren = d.ln
            d.children = []
            stack.append(d)
        elif d.kind == TP_MAP:
            # Maps hold a key and a value per entry.
            d.nChildren = d.ln * 2
            d.children = []
            stack.append(d)

    return all_items
def parse_mm_file(s):
    """Parse a MaxMind-DB file."""
    try:
        metadata_ptr = s.rindex(METADATA_MARKER)
    except ValueError:
        raise ValueError("No metadata!")

    metadata = parse_data_section(s[metadata_ptr+len(METADATA_MARKER):])

    if metadata[0].kind != TP_MAP:
        raise ValueError("Bad map")

    metadata[0].build_maps()
    mm = metadata[0].map

    tree_size = (((mm['record_size'].int_val() * 2) // 8 ) *
                 mm['node_count'].int_val())

    if s[tree_size:tree_size+16] != b'\x00'*16:
        raise ValueError("Missing section separator!")

    tree = parse_search_tree(s[:tree_size], mm['record_size'].int_val())

    data = parse_data_section(s[tree_size+16:metadata_ptr])

    resolve_pointers(data)
    resolve_tree(tree, data)

    for d in data:
        d.build_maps()

    return metadata, tree, data
def format_datum(datum):
    """Given a Datum at a leaf of the tree, return the string that we should
       write as its value.

       We first try country->iso_code which is the two-character ISO 3166-1
       country code of the country where MaxMind believes the end user is
       located.  If there's no such key, we try registered_country->iso_code
       which is the country in which the ISP has registered the IP address.
       Without falling back to registered_country, we'd leave out all ranges
       that MaxMind thinks belong to anonymous proxies, because those ranges
       don't contain country but only registered_country.  In short: let's
       fill all A1 entries with what ARIN et. al think.
    """
    try:
        return bytesToStr(datum.map['country'].map['iso_code'].data)
    except KeyError:
        pass
    try:
        return bytesToStr(datum.map['registered_country'].map['iso_code'].data)
    except KeyError:
        pass
    return None

# Seen as a 128-bit binary string, an IPv4 address starts with 96 zero bits.
IPV4_PREFIX = "0"*96
def dump_item_ipv4(entries, prefix, val):
    """Dump the information for an IPv4 address to entries, where 'prefix'
       is a string holding a binary prefix for the address, and 'val' is the
       value to dump.  If the prefix is not an IPv4 address (it does not start
       with 96 bits of 0), then print nothing.
    """
    if not prefix.startswith(IPV4_PREFIX):
        return
    # Drop the 96 leading zero bits; the rest is the IPv4 prefix.
    prefix = prefix[96:]
    v = int(prefix, 2)
    shift = 32 - len(prefix)
    lo = v << shift
    hi = ((v+1) << shift) - 1
    entries.append((lo, hi, val))
def fmt_item_ipv4(entry):
    """Format an IPv4 range with lo and hi addresses in decimal form."""
    return "%d,%d,%s\n"%(entry[0], entry[1], entry[2])
def fmt_ipv6_addr(v):
    """Given a 128-bit integer representing an ipv6 address, return a
       string for that ipv6 address."""
    return socket.inet_ntop(socket.AF_INET6, binascii.unhexlify("%032x"%v))
def fmt_item_ipv6(entry):
    """Format an IPv6 range with lo and hi addresses in hex form."""
    return "%s,%s,%s\n"%(fmt_ipv6_addr(entry[0]),
                         fmt_ipv6_addr(entry[1]),
                         entry[2])
# Binary-string prefixes of IPv6 ranges that we skip when emitting geoip6.
IPV4_MAPPED_IPV6_PREFIX = "0"*80 + "1"*16         # ::ffff:0:0/96
IPV6_6TO4_PREFIX = "0010000000000010"             # 2002::/16
TEREDO_IPV6_PREFIX = "0010000000000001" + "0"*16  # 2001::/32
def dump_item_ipv6(entries, prefix, val):
    """Dump the information for an IPv6 address prefix to entries, where
       'prefix' is a string holding a binary prefix for the address,
       and 'val' is the value to dump.  If the prefix is an IPv4 address
       (starts with 96 bits of 0), is an IPv4-mapped IPv6 address
       (::ffff:0:0/96), or is in the 6to4 mapping subnet (2002::/16), then
       print nothing.
    """
    # startswith() accepts a tuple of alternatives.
    if prefix.startswith((IPV4_PREFIX,
                          IPV4_MAPPED_IPV6_PREFIX,
                          IPV6_6TO4_PREFIX,
                          TEREDO_IPV6_PREFIX)):
        return
    v = int(prefix, 2)
    shift = 128 - len(prefix)
    lo = v << shift
    hi = ((v+1) << shift) - 1
    entries.append((lo, hi, val))
def dump_tree(entries, node, dump_item, prefix=""):
    """Walk the tree rooted at 'node', and call dump_item on the
       format_datum output of every leaf of the tree."""

    if isinstance(node, Tree):
        dump_tree(entries, node.left_item, dump_item, prefix+"0")
        dump_tree(entries, node.right_item, dump_item, prefix+"1")
    elif isinstance(node, Datum):
        assert node.kind == TP_MAP
        code = format_datum(node)
        if code:
            dump_item(entries, prefix, code)
    else:
        # "Not found" entries resolve to None; nothing to dump.
        assert node is None
GEOIP_FILE_HEADER = """\
# Last updated based on %s Maxmind GeoLite2 Country
# wget https://geolite.maxmind.com/download/geoip/database/GeoLite2-Country.mmdb.gz
# gunzip GeoLite2-Country.mmdb.gz
# python mmdb-convert.py GeoLite2-Country.mmdb
"""
def write_geoip_file(filename, metadata, the_tree, dump_item, fmt_item):
    """Write the entries in the_tree to filename."""
    entries = []
    dump_tree(entries, the_tree[0], dump_item)
    # 'with' guarantees the output file is closed even if formatting fails.
    with open(filename, 'w') as fobj:
        build_epoch = metadata[0].map['build_epoch'].int_val()
        # NOTE(review): '%-d' (day without zero-padding) is a glibc
        # strftime extension -- TODO confirm portability is acceptable.
        fobj.write(GEOIP_FILE_HEADER %
                   time.strftime('%B %-d %Y', time.gmtime(build_epoch)))

        unwritten = None
        for entry in entries:
            if not unwritten:
                unwritten = entry
            elif unwritten[1] + 1 == entry[0] and unwritten[2] == entry[2]:
                # Merge adjacent ranges that share a country code.
                unwritten = (unwritten[0], entry[1], unwritten[2])
            else:
                fobj.write(fmt_item(unwritten))
                unwritten = entry
        if unwritten:
            fobj.write(fmt_item(unwritten))
def main(argv):
    """Read the MaxMind-DB file named in argv[1] and write Tor-format
       'geoip' (IPv4) and 'geoip6' (IPv6) files in the current directory."""
    with open(argv[1], 'rb') as f:
        content = f.read()
    metadata, the_tree, _ = parse_mm_file(content)
    write_geoip_file('geoip', metadata, the_tree, dump_item_ipv4,
                     fmt_item_ipv4)
    write_geoip_file('geoip6', metadata, the_tree, dump_item_ipv6,
                     fmt_item_ipv6)

if __name__ == '__main__':
    main(sys.argv)