Bug 1850713: remove duplicated setting of early hint preloader id in `ScriptLoader...
[gecko.git] / xpcom / ds / tools / make_dafsa.py
bloba9cedf78a8c077379ff8aa0395a5360c5c07d6df
1 #!/usr/bin/env python
2 # Copyright 2014 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 """
7 A Deterministic acyclic finite state automaton (DAFSA) is a compact
8 representation of an unordered word list (dictionary).
10 http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
12 This python program converts a list of strings to a byte array in C++.
13 This python program fetches strings and return values from a gperf file
14 and generates a C++ file with a byte array representing graph that can be
15 used as a memory efficient replacement for the perfect hash table.
17 The input strings are assumed to consist of printable 7-bit ASCII characters
18 and the return values are assumed to be one digit integers.
20 In this program a DAFSA is a diamond shaped graph starting at a common
21 root node and ending at a common end node. All internal nodes contain
22 a character and each word is represented by the characters in one path from
23 the root node to the end node.
25 The order of the operations is crucial since lookups will be performed
26 starting from the source with no backtracking. Thus a node must have at
27 most one child with a label starting by the same character. The output
28 is also arranged so that all jumps are to increasing addresses, thus forward
29 in memory.
31 The generated output has suffix free decoding so that the sign of leading
32 bits in a link (a reference to a child node) indicate if it has a size of one,
33 two or three bytes and if it is the last outgoing link from the actual node.
34 A node label is terminated by a byte with the leading bit set.
36 The generated byte array can described by the following BNF:
38 <byte> ::= < 8-bit value in range [0x00-0xFF] >
40 <char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
41 <end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
42 <return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
44 <offset1> ::= < byte in range [0x00-0x3F] >
45 <offset2> ::= < byte in range [0x40-0x5F] >
46 <offset3> ::= < byte in range [0x60-0x7F] >
48 <end_offset1> ::= < byte in range [0x80-0xBF] >
49 <end_offset2> ::= < byte in range [0xC0-0xDF] >
50 <end_offset3> ::= < byte in range [0xE0-0xFF] >
52 <prefix> ::= <char>
54 <label> ::= <end_char>
55 | <char> <label>
57 <end_label> ::= <return_value>
58 | <char> <end_label>
60 <offset> ::= <offset1>
61 | <offset2> <byte>
62 | <offset3> <byte> <byte>
64 <end_offset> ::= <end_offset1>
65 | <end_offset2> <byte>
66 | <end_offset3> <byte> <byte>
68 <offsets> ::= <end_offset>
69 | <offset> <offsets>
71 <source> ::= <offsets>
73 <node> ::= <label> <offsets>
74 | <prefix> <node>
75 | <end_label>
77 <dafsa> ::= <source>
78 | <dafsa> <node>
80 Decoding:
82 <char> -> printable 7-bit ASCII character
83 <end_char> & 0x7F -> printable 7-bit ASCII character
84 <return value> & 0x0F -> integer
85 <offset1 & 0x3F> -> integer
86 ((<offset2> & 0x1F>) << 8) + <byte> -> integer
87 ((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer
89 end_offset1, end_offset2 and and_offset3 are decoded same as offset1,
90 offset2 and offset3 respectively.
92 The first offset in a list of offsets is the distance in bytes between the
93 offset itself and the first child node. Subsequent offsets are the distance
94 between previous child node and next child node. Thus each offset links a node
95 to a child node. The distance is always counted between start addresses, i.e.
96 first byte in decoded offset or first byte in child node.
98 Example 1:
101 aa, 1
102 a, 2
105 The input is first parsed to a list of words:
106 ["aa1", "a2"]
108 This produces the following graph:
109 [root] --- a --- 0x02 --- [end]
112 - a --- 0x01
114 A C++ representation of the compressed graph is generated:
116 const unsigned char dafsa[7] = {
117 0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
120 The bytes in the generated array has the following meaning:
122 0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1
124 1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a"
125 2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4
127 3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5
128 4: 0x82 <return_value> 0x82 & 0x0F -> return 2
130 5: 0x61 <char> label character 0x61 -> match "a"
131 6: 0x81 <return_value> 0x81 & 0x0F -> return 1
133 Example 2:
136 aa, 1
137 bbb, 2
138 baa, 1
141 The input is first parsed to a list of words:
142 ["aa1", "bbb2", "baa1"]
144 This produces the following graph:
145 [root] --- a --- a --- 0x01 --- [end]
146 | / / /
147 | / / /
148 - b --- b --- b --- 0x02
150 A C++ representation of the compressed graph is generated:
152 const unsigned char dafsa[11] = {
153 0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
156 The bytes in the generated array has the following meaning:
158 0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2
159 1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5
161 2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b"
162 3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5
163 4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8
165 5: 0x61 <char> label character 0x61 -> match "a"
166 6: 0x61 <char> label character 0x61 -> match "a"
167 7: 0x81 <return_value> 0x81 & 0x0F -> return 1
169 8: 0x62 <char> label character 0x62 -> match "b"
170 9: 0x62 <char> label character 0x62 -> match "b"
171 10: 0x82 <return_value> 0x82 & 0x0F -> return 2
173 import struct
174 import sys
176 from incremental_dafsa import Dafsa, Node
179 class InputError(Exception):
180 """Exception raised for errors in the input file."""
183 def top_sort(dafsa: Dafsa):
184 """Generates list of nodes in topological sort order."""
185 incoming = {}
187 def count_incoming(node: Node):
188 """Counts incoming references."""
189 if not node.is_end_node:
190 if id(node) not in incoming:
191 incoming[id(node)] = 1
192 for child in node.children.values():
193 count_incoming(child)
194 else:
195 incoming[id(node)] += 1
197 for node in dafsa.root_node.children.values():
198 count_incoming(node)
200 for node in dafsa.root_node.children.values():
201 incoming[id(node)] -= 1
203 waiting = [
204 node for node in dafsa.root_node.children.values() if incoming[id(node)] == 0
206 nodes = []
208 while waiting:
209 node = waiting.pop()
210 assert incoming[id(node)] == 0
211 nodes.append(node)
212 for child in node.children.values():
213 if not child.is_end_node:
214 incoming[id(child)] -= 1
215 if incoming[id(child)] == 0:
216 waiting.append(child)
217 return nodes
220 def encode_links(node: Node, offsets, current):
221 """Encodes a list of children as one, two or three byte offsets."""
222 if next(iter(node.children.values())).is_end_node:
223 # This is an <end_label> node and no links follow such nodes
224 return []
225 guess = 3 * len(node.children)
226 assert node.children
228 children = sorted(node.children.values(), key=lambda x: -offsets[id(x)])
229 while True:
230 offset = current + guess
231 buf = []
232 for child in children:
233 last = len(buf)
234 distance = offset - offsets[id(child)]
235 assert distance > 0 and distance < (1 << 21)
237 if distance < (1 << 6):
238 # A 6-bit offset: "s0xxxxxx"
239 buf.append(distance)
240 elif distance < (1 << 13):
241 # A 13-bit offset: "s10xxxxxxxxxxxxx"
242 buf.append(0x40 | (distance >> 8))
243 buf.append(distance & 0xFF)
244 else:
245 # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
246 buf.append(0x60 | (distance >> 16))
247 buf.append((distance >> 8) & 0xFF)
248 buf.append(distance & 0xFF)
249 # Distance in first link is relative to following record.
250 # Distance in other links are relative to previous link.
251 offset -= distance
252 if len(buf) == guess:
253 break
254 guess = len(buf)
255 # Set most significant bit to mark end of links in this node.
256 buf[last] |= 1 << 7
257 buf.reverse()
258 return buf
261 def encode_prefix(label):
262 """Encodes a node label as a list of bytes without a trailing high byte.
264 This method encodes a node if there is exactly one child and the
265 child follows immediately after so that no jump is needed. This label
266 will then be a prefix to the label in the child node.
268 assert label
269 return [ord(c) for c in reversed(label)]
272 def encode_label(label):
273 """Encodes a node label as a list of bytes with a trailing high byte >0x80."""
274 buf = encode_prefix(label)
275 # Set most significant bit to mark end of label in this node.
276 buf[0] |= 1 << 7
277 return buf
280 def encode(dafsa: Dafsa):
281 """Encodes a DAFSA to a list of bytes"""
282 output = []
283 offsets = {}
285 for node in reversed(top_sort(dafsa)):
286 if (
287 len(node.children) == 1
288 and not next(iter(node.children.values())).is_end_node
289 and (offsets[id(next(iter(node.children.values())))] == len(output))
291 output.extend(encode_prefix(node.character))
292 else:
293 output.extend(encode_links(node, offsets, len(output)))
294 output.extend(encode_label(node.character))
295 offsets[id(node)] = len(output)
297 output.extend(encode_links(dafsa.root_node, offsets, len(output)))
298 output.reverse()
299 return output
302 def to_cxx(data, preamble=None):
303 """Generates C++ code from a list of encoded bytes."""
304 text = "/* This file is generated. DO NOT EDIT!\n\n"
305 text += "The byte array encodes a dictionary of strings and values. See "
306 text += "make_dafsa.py for documentation."
307 text += "*/\n\n"
309 if preamble:
310 text += preamble
311 text += "\n\n"
313 text += "const unsigned char kDafsa[%s] = {\n" % len(data)
314 for i in range(0, len(data), 12):
315 text += " "
316 text += ", ".join("0x%02x" % byte for byte in data[i : i + 12])
317 text += ",\n"
318 text += "};\n"
319 return text
322 def words_to_cxx(words, preamble=None):
323 """Generates C++ code from a word list"""
324 dafsa = Dafsa.from_tld_data(words)
325 return to_cxx(encode(dafsa), preamble)
328 def words_to_bin(words):
329 """Generates bytes from a word list"""
330 dafsa = Dafsa.from_tld_data(words)
331 data = encode(dafsa)
332 return struct.pack("%dB" % len(data), *data)
335 def parse_gperf(infile):
336 """Parses gperf file and extract strings and return code"""
337 lines = [line.strip() for line in infile]
339 # Extract the preamble.
340 first_delimeter = lines.index("%%")
341 preamble = "\n".join(lines[0:first_delimeter])
343 # Extract strings after the first '%%' and before the second '%%'.
344 begin = first_delimeter + 1
345 end = lines.index("%%", begin)
346 lines = lines[begin:end]
347 for line in lines:
348 if line[-3:-1] != ", ":
349 raise InputError('Expected "domainname, <digit>", found "%s"' % line)
350 # Technically the DAFSA format could support return values in range [0-31],
351 # but the values below are the only with a defined meaning.
352 if line[-1] not in "0124":
353 raise InputError(
354 'Expected value to be one of {0,1,2,4}, found "%s"' % line[-1]
356 return (preamble, [line[:-3] + line[-1] for line in lines])
359 def main(outfile, infile):
360 with open(infile, "r") as infile:
361 preamble, words = parse_gperf(infile)
362 outfile.write(words_to_cxx(words, preamble))
363 return 0
366 if __name__ == "__main__":
367 if len(sys.argv) != 3:
368 print("usage: %s infile outfile" % sys.argv[0])
369 sys.exit(1)
371 with open(sys.argv[2], "w") as outfile:
372 sys.exit(main(outfile, sys.argv[1]))