2 # Copyright 2014 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
7 A Deterministic acyclic finite state automaton (DAFSA) is a compact
8 representation of an unordered word list (dictionary).
10 http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
12 This python program converts a list of strings to a byte array in C++.
13 This python program fetches strings and return values from a gperf file
14 and generates a C++ file with a byte array representing graph that can be
15 used as a memory efficient replacement for the perfect hash table.
17 The input strings are assumed to consist of printable 7-bit ASCII characters
18 and the return values are assumed to be one digit integers.
20 In this program a DAFSA is a diamond shaped graph starting at a common
21 root node and ending at a common end node. All internal nodes contain
22 a character and each word is represented by the characters in one path from
23 the root node to the end node.
25 The order of the operations is crucial since lookups will be performed
26 starting from the source with no backtracking. Thus a node must have at
27 most one child with a label starting by the same character. The output
28 is also arranged so that all jumps are to increasing addresses, thus forward
31 The generated output has suffix free decoding so that the sign of leading
32 bits in a link (a reference to a child node) indicate if it has a size of one,
33 two or three bytes and if it is the last outgoing link from the actual node.
34 A node label is terminated by a byte with the leading bit set.
36 The generated byte array can described by the following BNF:
38 <byte> ::= < 8-bit value in range [0x00-0xFF] >
40 <char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
41 <end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
42 <return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
44 <offset1> ::= < byte in range [0x00-0x3F] >
45 <offset2> ::= < byte in range [0x40-0x5F] >
46 <offset3> ::= < byte in range [0x60-0x7F] >
48 <end_offset1> ::= < byte in range [0x80-0xBF] >
49 <end_offset2> ::= < byte in range [0xC0-0xDF] >
50 <end_offset3> ::= < byte in range [0xE0-0xFF] >
54 <label> ::= <end_char>
57 <end_label> ::= <return_value>
60 <offset> ::= <offset1>
62 | <offset3> <byte> <byte>
64 <end_offset> ::= <end_offset1>
65 | <end_offset2> <byte>
66 | <end_offset3> <byte> <byte>
68 <offsets> ::= <end_offset>
71 <source> ::= <offsets>
73 <node> ::= <label> <offsets>
82 <char> -> printable 7-bit ASCII character
83 <end_char> & 0x7F -> printable 7-bit ASCII character
84 <return value> & 0x0F -> integer
85 <offset1 & 0x3F> -> integer
86 ((<offset2> & 0x1F>) << 8) + <byte> -> integer
87 ((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer
89 end_offset1, end_offset2 and and_offset3 are decoded same as offset1,
90 offset2 and offset3 respectively.
92 The first offset in a list of offsets is the distance in bytes between the
93 offset itself and the first child node. Subsequent offsets are the distance
94 between previous child node and next child node. Thus each offset links a node
95 to a child node. The distance is always counted between start addresses, i.e.
96 first byte in decoded offset or first byte in child node.
105 The input is first parsed to a list of words:
108 This produces the following graph:
109 [root] --- a --- 0x02 --- [end]
114 A C++ representation of the compressed graph is generated:
116 const unsigned char dafsa[7] = {
117 0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
120 The bytes in the generated array has the following meaning:
122 0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1
124 1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a"
125 2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4
127 3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5
128 4: 0x82 <return_value> 0x82 & 0x0F -> return 2
130 5: 0x61 <char> label character 0x61 -> match "a"
131 6: 0x81 <return_value> 0x81 & 0x0F -> return 1
141 The input is first parsed to a list of words:
142 ["aa1", "bbb2", "baa1"]
144 This produces the following graph:
145 [root] --- a --- a --- 0x01 --- [end]
148 - b --- b --- b --- 0x02
150 A C++ representation of the compressed graph is generated:
152 const unsigned char dafsa[11] = {
153 0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
156 The bytes in the generated array has the following meaning:
158 0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2
159 1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5
161 2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b"
162 3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5
163 4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8
165 5: 0x61 <char> label character 0x61 -> match "a"
166 6: 0x61 <char> label character 0x61 -> match "a"
167 7: 0x81 <return_value> 0x81 & 0x0F -> return 1
169 8: 0x62 <char> label character 0x62 -> match "b"
170 9: 0x62 <char> label character 0x62 -> match "b"
171 10: 0x82 <return_value> 0x82 & 0x0F -> return 2
176 from incremental_dafsa
import Dafsa
, Node
179 class InputError(Exception):
180 """Exception raised for errors in the input file."""
183 def top_sort(dafsa
: Dafsa
):
184 """Generates list of nodes in topological sort order."""
187 def count_incoming(node
: Node
):
188 """Counts incoming references."""
189 if not node
.is_end_node
:
190 if id(node
) not in incoming
:
191 incoming
[id(node
)] = 1
192 for child
in node
.children
.values():
193 count_incoming(child
)
195 incoming
[id(node
)] += 1
197 for node
in dafsa
.root_node
.children
.values():
200 for node
in dafsa
.root_node
.children
.values():
201 incoming
[id(node
)] -= 1
204 node
for node
in dafsa
.root_node
.children
.values() if incoming
[id(node
)] == 0
210 assert incoming
[id(node
)] == 0
212 for child
in node
.children
.values():
213 if not child
.is_end_node
:
214 incoming
[id(child
)] -= 1
215 if incoming
[id(child
)] == 0:
216 waiting
.append(child
)
220 def encode_links(node
: Node
, offsets
, current
):
221 """Encodes a list of children as one, two or three byte offsets."""
222 if next(iter(node
.children
.values())).is_end_node
:
223 # This is an <end_label> node and no links follow such nodes
225 guess
= 3 * len(node
.children
)
228 children
= sorted(node
.children
.values(), key
=lambda x
: -offsets
[id(x
)])
230 offset
= current
+ guess
232 for child
in children
:
234 distance
= offset
- offsets
[id(child
)]
235 assert distance
> 0 and distance
< (1 << 21)
237 if distance
< (1 << 6):
238 # A 6-bit offset: "s0xxxxxx"
240 elif distance
< (1 << 13):
241 # A 13-bit offset: "s10xxxxxxxxxxxxx"
242 buf
.append(0x40 |
(distance
>> 8))
243 buf
.append(distance
& 0xFF)
245 # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
246 buf
.append(0x60 |
(distance
>> 16))
247 buf
.append((distance
>> 8) & 0xFF)
248 buf
.append(distance
& 0xFF)
249 # Distance in first link is relative to following record.
250 # Distance in other links are relative to previous link.
252 if len(buf
) == guess
:
255 # Set most significant bit to mark end of links in this node.
261 def encode_prefix(label
):
262 """Encodes a node label as a list of bytes without a trailing high byte.
264 This method encodes a node if there is exactly one child and the
265 child follows immediately after so that no jump is needed. This label
266 will then be a prefix to the label in the child node.
269 return [ord(c
) for c
in reversed(label
)]
272 def encode_label(label
):
273 """Encodes a node label as a list of bytes with a trailing high byte >0x80."""
274 buf
= encode_prefix(label
)
275 # Set most significant bit to mark end of label in this node.
280 def encode(dafsa
: Dafsa
):
281 """Encodes a DAFSA to a list of bytes"""
285 for node
in reversed(top_sort(dafsa
)):
287 len(node
.children
) == 1
288 and not next(iter(node
.children
.values())).is_end_node
289 and (offsets
[id(next(iter(node
.children
.values())))] == len(output
))
291 output
.extend(encode_prefix(node
.character
))
293 output
.extend(encode_links(node
, offsets
, len(output
)))
294 output
.extend(encode_label(node
.character
))
295 offsets
[id(node
)] = len(output
)
297 output
.extend(encode_links(dafsa
.root_node
, offsets
, len(output
)))
302 def to_cxx(data
, preamble
=None):
303 """Generates C++ code from a list of encoded bytes."""
304 text
= "/* This file is generated. DO NOT EDIT!\n\n"
305 text
+= "The byte array encodes a dictionary of strings and values. See "
306 text
+= "make_dafsa.py for documentation."
313 text
+= "const unsigned char kDafsa[%s] = {\n" % len(data
)
314 for i
in range(0, len(data
), 12):
316 text
+= ", ".join("0x%02x" % byte
for byte
in data
[i
: i
+ 12])
322 def words_to_cxx(words
, preamble
=None):
323 """Generates C++ code from a word list"""
324 dafsa
= Dafsa
.from_tld_data(words
)
325 return to_cxx(encode(dafsa
), preamble
)
328 def words_to_bin(words
):
329 """Generates bytes from a word list"""
330 dafsa
= Dafsa
.from_tld_data(words
)
332 return struct
.pack("%dB" % len(data
), *data
)
335 def parse_gperf(infile
):
336 """Parses gperf file and extract strings and return code"""
337 lines
= [line
.strip() for line
in infile
]
339 # Extract the preamble.
340 first_delimeter
= lines
.index("%%")
341 preamble
= "\n".join(lines
[0:first_delimeter
])
343 # Extract strings after the first '%%' and before the second '%%'.
344 begin
= first_delimeter
+ 1
345 end
= lines
.index("%%", begin
)
346 lines
= lines
[begin
:end
]
348 if line
[-3:-1] != ", ":
349 raise InputError('Expected "domainname, <digit>", found "%s"' % line
)
350 # Technically the DAFSA format could support return values in range [0-31],
351 # but the values below are the only with a defined meaning.
352 if line
[-1] not in "0124":
354 'Expected value to be one of {0,1,2,4}, found "%s"' % line
[-1]
356 return (preamble
, [line
[:-3] + line
[-1] for line
in lines
])
359 def main(outfile
, infile
):
360 with
open(infile
, "r") as infile
:
361 preamble
, words
= parse_gperf(infile
)
362 outfile
.write(words_to_cxx(words
, preamble
))
366 if __name__
== "__main__":
367 if len(sys
.argv
) != 3:
368 print("usage: %s infile outfile" % sys
.argv
[0])
371 with
open(sys
.argv
[2], "w") as outfile
:
372 sys
.exit(main(outfile
, sys
.argv
[1]))