xpcom/ds/tools/make_dafsa.py

   1 #!/usr/bin/env python
   2 # Copyright 2014 The Chromium Authors. All rights reserved.
   3 # Use of this source code is governed by a BSD-style license that can be
   4 # found in the LICENSE file.
   5
   6 """
   7 A Deterministic acyclic finite state automaton (DAFSA) is a compact
   8 representation of an unordered word list (dictionary).
   9
  10 http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
  11
  12 This python program converts a list of strings to a byte array in C++.
  13 This python program fetches strings and return values from a gperf file
  14 and generates a C++ file with a byte array representing graph that can be
  15 used as a memory efficient replacement for the perfect hash table.
  16
  17 The input strings are assumed to consist of printable 7-bit ASCII characters
  18 and the return values are assumed to be one digit integers.
  19
  20 In this program a DAFSA is a diamond shaped graph starting at a common
  21 root node and ending at a common end node. All internal nodes contain
  22 a character and each word is represented by the characters in one path from
  23 the root node to the end node.
  24
  25 The order of the operations is crucial since lookups will be performed
  26 starting from the source with no backtracking. Thus a node must have at
  27 most one child with a label starting by the same character. The output
  28 is also arranged so that all jumps are to increasing addresses, thus forward
  29 in memory.
  30
  31 The generated output has suffix free decoding so that the sign of leading
  32 bits in a link (a reference to a child node) indicate if it has a size of one,
  33 two or three bytes and if it is the last outgoing link from the actual node.
  34 A node label is terminated by a byte with the leading bit set.
  35
  36 The generated byte array can described by the following BNF:
  37
  38 <byte> ::= < 8-bit value in range [0x00-0xFF] >
  39
  40 <char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
  41 <end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
  42 <return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
  43
  44 <offset1> ::= < byte in range [0x00-0x3F] >
  45 <offset2> ::= < byte in range [0x40-0x5F] >
  46 <offset3> ::= < byte in range [0x60-0x7F] >
  47
  48 <end_offset1> ::= < byte in range [0x80-0xBF] >
  49 <end_offset2> ::= < byte in range [0xC0-0xDF] >
  50 <end_offset3> ::= < byte in range [0xE0-0xFF] >
  51
  52 <prefix> ::= <char>
  53
  54 <label> ::= <end_char>
  55           | <char> <label>
  56
  57 <end_label> ::= <return_value>
  58           | <char> <end_label>
  59
  60 <offset> ::= <offset1>
  61            | <offset2> <byte>
  62            | <offset3> <byte> <byte>
  63
  64 <end_offset> ::= <end_offset1>
  65                | <end_offset2> <byte>
  66                | <end_offset3> <byte> <byte>
  67
  68 <offsets> ::= <end_offset>
  69             | <offset> <offsets>
  70
  71 <source> ::= <offsets>
  72
  73 <node> ::= <label> <offsets>
  74          | <prefix> <node>
  75          | <end_label>
  76
  77 <dafsa> ::= <source>
  78           | <dafsa> <node>
  79
  80 Decoding:
  81
  82 <char> -> printable 7-bit ASCII character
  83 <end_char> & 0x7F -> printable 7-bit ASCII character
  84 <return value> & 0x0F -> integer
  85 <offset1 & 0x3F> -> integer
  86 ((<offset2> & 0x1F>) << 8) + <byte> -> integer
  87 ((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer
  88
  89 end_offset1, end_offset2 and and_offset3 are decoded same as offset1,
  90 offset2 and offset3 respectively.
  91
  92 The first offset in a list of offsets is the distance in bytes between the
  93 offset itself and the first child node. Subsequent offsets are the distance
  94 between previous child node and next child node. Thus each offset links a node
  95 to a child node. The distance is always counted between start addresses, i.e.
  96 first byte in decoded offset or first byte in child node.
  97
  98 Example 1:
  99
 100 %%
 101 aa, 1
 102 a, 2
 103 %%
 104
 105 The input is first parsed to a list of words:
 106 ["aa1", "a2"]
 107
 108 This produces the following graph:
 109 [root] --- a --- 0x02 --- [end]
 110             |              /
 111              |           /
 112               - a --- 0x01
 113
 114 A C++ representation of the compressed graph is generated:
 115
 116 const unsigned char dafsa[7] = {
 117   0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
 118 };
 119
 120 The bytes in the generated array has the following meaning:
 121
 122  0: 0x81 <end_offset1>  child at position 0 + (0x81 & 0x3F) -> jump to 1
 123
 124  1: 0xE1 <end_char>     label character (0xE1 & 0x7F) -> match "a"
 125  2: 0x02 <offset1>      child at position 2 + (0x02 & 0x3F) -> jump to 4
 126
 127  3: 0x81 <end_offset1>  child at position 4 + (0x81 & 0x3F) -> jump to 5
 128  4: 0x82 <return_value> 0x82 & 0x0F -> return 2
 129
 130  5: 0x61 <char>         label character 0x61 -> match "a"
 131  6: 0x81 <return_value> 0x81 & 0x0F -> return 1
 132
 133 Example 2:
 134
 135 %%
 136 aa, 1
 137 bbb, 2
 138 baa, 1
 139 %%
 140
 141 The input is first parsed to a list of words:
 142 ["aa1", "bbb2", "baa1"]
 143
 144 This produces the following graph:
 145 [root] --- a --- a --- 0x01 --- [end]
 146  |       /           /         /
 147   |    /           /         /
 148    - b --- b --- b --- 0x02
 149
 150 A C++ representation of the compressed graph is generated:
 151
 152 const unsigned char dafsa[11] = {
 153   0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
 154 };
 155
 156 The bytes in the generated array has the following meaning:
 157
 158  0: 0x02 <offset1>      child at position 0 + (0x02 & 0x3F) -> jump to 2
 159  1: 0x83 <end_offset1>  child at position 2 + (0x83 & 0x3F) -> jump to 5
 160
 161  2: 0xE2 <end_char>     label character (0xE2 & 0x7F) -> match "b"
 162  3: 0x02 <offset1>      child at position 3 + (0x02 & 0x3F) -> jump to 5
 163  4: 0x83 <end_offset1>  child at position 5 + (0x83 & 0x3F) -> jump to 8
 164
 165  5: 0x61 <char>         label character 0x61 -> match "a"
 166  6: 0x61 <char>         label character 0x61 -> match "a"
 167  7: 0x81 <return_value> 0x81 & 0x0F -> return 1
 168
 169  8: 0x62 <char>         label character 0x62 -> match "b"
 170  9: 0x62 <char>         label character 0x62 -> match "b"
 171 10: 0x82 <return_value> 0x82 & 0x0F -> return 2
 172 """
 173 import struct
 174 import sys
 175
 176 from incremental_dafsa import Dafsa, Node
 177
 178
 179 class InputError(Exception):
 180     """Exception raised for errors in the input file."""
 181
 182
 183 def top_sort(dafsa: Dafsa):
 184     """Generates list of nodes in topological sort order."""
 185     incoming = {}
 186
 187     def count_incoming(node: Node):
 188         """Counts incoming references."""
 189         if not node.is_end_node:
 190             if id(node) not in incoming:
 191                 incoming[id(node)] = 1
 192                 for child in node.children.values():
 193                     count_incoming(child)
 194             else:
 195                 incoming[id(node)] += 1
 196
 197     for node in dafsa.root_node.children.values():
 198         count_incoming(node)
 199
 200     for node in dafsa.root_node.children.values():
 201         incoming[id(node)] -= 1
 202
 203     waiting = [
 204         node for node in dafsa.root_node.children.values() if incoming[id(node)] == 0
 205     ]
 206     nodes = []
 207
 208     while waiting:
 209         node = waiting.pop()
 210         assert incoming[id(node)] == 0
 211         nodes.append(node)
 212         for child in node.children.values():
 213             if not child.is_end_node:
 214                 incoming[id(child)] -= 1
 215                 if incoming[id(child)] == 0:
 216                     waiting.append(child)
 217     return nodes
 218
 219
 220 def encode_links(node: Node, offsets, current):
 221     """Encodes a list of children as one, two or three byte offsets."""
 222     if next(iter(node.children.values())).is_end_node:
 223         # This is an <end_label> node and no links follow such nodes
 224         return []
 225     guess = 3 * len(node.children)
 226     assert node.children
 227
 228     children = sorted(node.children.values(), key=lambda x: -offsets[id(x)])
 229     while True:
 230         offset = current + guess
 231         buf = []
 232         for child in children:
 233             last = len(buf)
 234             distance = offset - offsets[id(child)]
 235             assert distance > 0 and distance < (1 << 21)
 236
 237             if distance < (1 << 6):
 238                 # A 6-bit offset: "s0xxxxxx"
 239                 buf.append(distance)
 240             elif distance < (1 << 13):
 241                 # A 13-bit offset: "s10xxxxxxxxxxxxx"
 242                 buf.append(0x40 | (distance >> 8))
 243                 buf.append(distance & 0xFF)
 244             else:
 245                 # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
 246                 buf.append(0x60 | (distance >> 16))
 247                 buf.append((distance >> 8) & 0xFF)
 248                 buf.append(distance & 0xFF)
 249             # Distance in first link is relative to following record.
 250             # Distance in other links are relative to previous link.
 251             offset -= distance
 252         if len(buf) == guess:
 253             break
 254         guess = len(buf)
 255     # Set most significant bit to mark end of links in this node.
 256     buf[last] |= 1 << 7
 257     buf.reverse()
 258     return buf
 259
 260
 261 def encode_prefix(label):
 262     """Encodes a node label as a list of bytes without a trailing high byte.
 263
 264     This method encodes a node if there is exactly one child  and the
 265     child follows immediately after so that no jump is needed. This label
 266     will then be a prefix to the label in the child node.
 267     """
 268     assert label
 269     return [ord(c) for c in reversed(label)]
 270
 271
 272 def encode_label(label):
 273     """Encodes a node label as a list of bytes with a trailing high byte >0x80."""
 274     buf = encode_prefix(label)
 275     # Set most significant bit to mark end of label in this node.
 276     buf[0] |= 1 << 7
 277     return buf
 278
 279
 280 def encode(dafsa: Dafsa):
 281     """Encodes a DAFSA to a list of bytes"""
 282     output = []
 283     offsets = {}
 284
 285     for node in reversed(top_sort(dafsa)):
 286         if (
 287             len(node.children) == 1
 288             and not next(iter(node.children.values())).is_end_node
 289             and (offsets[id(next(iter(node.children.values())))] == len(output))
 290         ):
 291             output.extend(encode_prefix(node.character))
 292         else:
 293             output.extend(encode_links(node, offsets, len(output)))
 294             output.extend(encode_label(node.character))
 295         offsets[id(node)] = len(output)
 296
 297     output.extend(encode_links(dafsa.root_node, offsets, len(output)))
 298     output.reverse()
 299     return output
 300
 301
 302 def to_cxx(data, preamble=None):
 303     """Generates C++ code from a list of encoded bytes."""
 304     text = "/* This file is generated. DO NOT EDIT!\n\n"
 305     text += "The byte array encodes a dictionary of strings and values. See "
 306     text += "make_dafsa.py for documentation."
 307     text += "*/\n\n"
 308
 309     if preamble:
 310         text += preamble
 311         text += "\n\n"
 312
 313     text += "const unsigned char kDafsa[%s] = {\n" % len(data)
 314     for i in range(0, len(data), 12):
 315         text += "  "
 316         text += ", ".join("0x%02x" % byte for byte in data[i : i + 12])
 317         text += ",\n"
 318     text += "};\n"
 319     return text
 320
 321
 322 def words_to_cxx(words, preamble=None):
 323     """Generates C++ code from a word list"""
 324     dafsa = Dafsa.from_tld_data(words)
 325     return to_cxx(encode(dafsa), preamble)
 326
 327
 328 def words_to_bin(words):
 329     """Generates bytes from a word list"""
 330     dafsa = Dafsa.from_tld_data(words)
 331     data = encode(dafsa)
 332     return struct.pack("%dB" % len(data), *data)
 333
 334
 335 def parse_gperf(infile):
 336     """Parses gperf file and extract strings and return code"""
 337     lines = [line.strip() for line in infile]
 338
 339     # Extract the preamble.
 340     first_delimeter = lines.index("%%")
 341     preamble = "\n".join(lines[0:first_delimeter])
 342
 343     # Extract strings after the first '%%' and before the second '%%'.
 344     begin = first_delimeter + 1
 345     end = lines.index("%%", begin)
 346     lines = lines[begin:end]
 347     for line in lines:
 348         if line[-3:-1] != ", ":
 349             raise InputError('Expected "domainname, <digit>", found "%s"' % line)
 350         # Technically the DAFSA format could support return values in range [0-31],
 351         # but the values below are the only with a defined meaning.
 352         if line[-1] not in "0124":
 353             raise InputError(
 354                 'Expected value to be one of {0,1,2,4}, found "%s"' % line[-1]
 355             )
 356     return (preamble, [line[:-3] + line[-1] for line in lines])
 357
 358
 359 def main(outfile, infile):
 360     with open(infile, "r") as infile:
 361         preamble, words = parse_gperf(infile)
 362         outfile.write(words_to_cxx(words, preamble))
 363     return 0
 364
 365
 366 if __name__ == "__main__":
 367     if len(sys.argv) != 3:
 368         print("usage: %s infile outfile" % sys.argv[0])
 369         sys.exit(1)
 370
 371     with open(sys.argv[2], "w") as outfile:
 372         sys.exit(main(outfile, sys.argv[1]))