Bug 1612605 [wpt PR 21532] - Update wpt metadata, a=testonly
[gecko.git] / netwerk / dns / prepare_tlds.py
blobc4235d1f145bd31fe0f7798bba5da66d59e0d489
1 # This Source Code Form is subject to the terms of the Mozilla Public
2 # License, v. 2.0. If a copy of the MPL was not distributed with this
3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 import codecs
6 import encodings.idna
7 import imp
8 import os
9 import re
10 import sys
11 from make_dafsa import words_to_cxx, words_to_bin
13 """
14 Processes a file containing effective TLD data. See the following URL for a
15 description of effective TLDs and of the file format that this script
16 processes (although for the latter you're better off just reading this file's
17 short source code).
19 http://wiki.mozilla.org/Gecko:Effective_TLD_Service
20 """
def getEffectiveTLDs(path):
    """
    Generates one EffectiveTLDEntry per rule found in the effective TLD file
    at path, which is decoded as UTF-8.

    Lines starting with "//" and lines that contain no "." are skipped. Only
    the text up to the first space, tab or newline of a line is used as the
    rule. An assertion fails if the same normalized domain appears twice.
    """
    domains = set()
    # The with-block closes the file once the generator is exhausted or
    # discarded, instead of leaking the handle.
    with codecs.open(path, "r", "UTF-8") as tld_file:
        # Iterating a file never yields an empty string, so no explicit
        # end-of-file check is needed. (Raising StopIteration from inside a
        # generator is turned into a RuntimeError by PEP 479.)
        for line in tld_file:
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if line.startswith("//") or "." not in line:
                continue
            # maxsplit is passed by keyword: passing it positionally is
            # deprecated in recent Python versions.
            line = re.split(r"[ \t\n]", line, maxsplit=1)[0]
            entry = EffectiveTLDEntry(line)
            domain = entry.domain()
            assert domain not in domains, \
                "repeating domain %s makes no sense" % domain
            domains.add(domain)
            yield entry
def _normalizeHostname(domain):
    """
    Returns domain with each dot-separated component normalized: ASCII
    components are lowercased, while non-ASCII components are converted with
    the ToASCII algorithm.
    """
    normalized = []
    for label in domain.split("."):
        if _isASCII(label):
            normalized.append(label.lower())
        else:
            # ToASCII's result is decoded so that every joined piece is text.
            normalized.append(encodings.idna.ToASCII(label).decode("utf-8"))
    return ".".join(normalized)
def _isASCII(s):
    "True if every character of s has a code point of at most 127."
    return all(ord(ch) <= 127 for ch in s)
class EffectiveTLDEntry:
    """
    A single rule read from an effective-TLD name file.
    """

    # Class-level defaults; __init__ overrides them on the instance when the
    # rule carries the corresponding prefix.
    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Builds the entry from one line of data, which must already have been
        stripped of its line ending. A leading "!" marks an exception rule
        and a leading "*." marks a wildcard rule; the prefix is removed
        before the remaining domain is normalized.
        """
        domain = line
        if line.startswith("!"):
            self._exception = True
            domain = line[1:]
        elif line.startswith("*."):
            self._wild = True
            domain = line[2:]
        self._domain = _normalizeHostname(domain)

    def domain(self):
        "The normalized domain this entry represents."
        return self._domain

    def exception(self):
        "True if this entry's domain does not denote an effective TLD."
        return self._exception

    def wild(self):
        "True if this entry represents a class of effective TLDs."
        return self._wild
97 #################
98 # DO EVERYTHING #
99 #################
def main(output, effective_tld_filename, output_format="cxx"):
    """
    effective_tld_filename is the effective TLD file to parse.
    Depending on output_format, either the C++ array form of the DAFSA
    representing the eTLD file ("cxx") or its binary form ("bin") is written
    to output.
    """

    def typeEnum(etld):
        """
        Maps the entry's flags to the DAFSA's enum types.
        """
        if etld.exception():
            return 1
        return 2 if etld.wild() else 0

    def dafsa_words():
        """
        make_dafsa expects lines of the form "<domain_name><enum_value>"
        """
        for etld in getEffectiveTLDs(effective_tld_filename):
            word = "%s%d" % (etld.domain(), typeEnum(etld))
            yield word

    # words_to_bin() returns bytes while words_to_cxx() returns a string
    if output_format != "bin":
        output.write(words_to_cxx(dafsa_words()))
        return

    if sys.version_info[0] >= 3:
        # bytes have to go to the underlying binary stream on Python 3
        output = output.buffer
    output.write(words_to_bin(dafsa_words()))
if __name__ == '__main__':
    # This program can output the DAFSA in two formats:
    # as C++ code that will be included and compiled at build time
    # or as a binary file that will be published in Remote Settings.
    #
    # Usage: prepare_tlds.py [--bin] <effective_tld_filename>
    #
    # Flags for format options:
    #   (no flag) -> C++ array [default]
    #   --bin     -> Binary file
    cli_args = sys.argv[1:]
    output_format = "bin" if "--bin" in cli_args else "cxx"
    # Use the first non-flag argument as the input file, so that "--bin" may
    # be given either before or after the path.
    positional = [arg for arg in cli_args if arg != "--bin"]
    if not positional:
        sys.exit("usage: %s [--bin] <effective_tld_filename>" % sys.argv[0])
    main(sys.stdout, positional[0], output_format=output_format)