Bug 1810189 - Update MOTS for WebGPU: +jimb,+egubler,+nical,+teoxoy. DONTBUILD r...
[gecko.git] / netwerk / dns / prepare_tlds.py
blob53d0bf526d3a79673b471c07dfd1c1ffb1ba9471
1 # This Source Code Form is subject to the terms of the Mozilla Public
2 # License, v. 2.0. If a copy of the MPL was not distributed with this
3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 import codecs
6 import encodings.idna
7 import imp
8 import os
9 import re
10 import sys
11 from make_dafsa import words_to_cxx, words_to_bin
13 """
14 Processes a file containing effective TLD data. See the following URL for a
15 description of effective TLDs and of the file format that this script
16 processes (although for the latter you're better off just reading this file's
17 short source code).
19 http://wiki.mozilla.org/Gecko:Effective_TLD_Service
20 """
def getEffectiveTLDs(path):
    """
    Generates an EffectiveTLDEntry for every rule in the effective-TLD file
    located at path.

    The file is decoded as UTF-8.  Lines that start with "//" and lines that
    are empty or whitespace-only are skipped.  On every other line only the
    text before the first space or tab is used as the rule.

    Raises AssertionError if two rules normalize to the same domain.
    """
    domains = set()
    # The with-block guarantees the file is closed when the generator is
    # exhausted, closed early by its consumer, or aborted by the assertion.
    with codecs.open(path, "r", "UTF-8") as tld_file:
        # Iterating the file never produces an empty string, so no explicit
        # end-of-file check is needed; the loop simply ends.
        for line in tld_file:
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if line.startswith("//") or not line.strip():
                continue
            # Anything after the first whitespace character is not part of
            # the rule.
            line = re.split(r"[ \t\n]", line, maxsplit=1)[0]
            entry = EffectiveTLDEntry(line)
            domain = entry.domain()
            assert domain not in domains, "repeating domain %s makes no sense" % domain
            domains.add(domain)
            yield entry
def _normalizeHostname(domain):
    """
    Returns domain with each dot-separated label normalized: labels made up
    only of ASCII characters are lowercased, and every other label is run
    through the IDNA ToASCII algorithm.
    """
    normalized = []
    for label in domain.split("."):
        # A label counts as ASCII when no character has a code point above 127.
        if all(ord(ch) <= 127 for ch in label):
            normalized.append(label.lower())
        else:
            # ToASCII returns bytes; decode them back to text.
            normalized.append(encodings.idna.ToASCII(label).decode("utf-8"))
    return ".".join(normalized)
def _isASCII(s):
    """
    True if every character of s has a code point of at most 127 (which is
    also the case for an empty s), false otherwise.
    """
    return all(ord(ch) <= 127 for ch in s)
class EffectiveTLDEntry:
    """
    A single rule read from an effective-TLD name file.
    """

    # Class-level defaults.  __init__ overrides at most one of them on the
    # instance, and only when the rule carries the matching prefix.
    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Builds the entry from one rule line, which must already have had its
        line ending removed.  A leading "!" marks an exception rule and a
        leading "*." marks a wildcard rule; the prefix is removed before the
        remainder is normalized and stored as the domain.
        """
        hostname = line
        if line.startswith("!"):
            hostname = line[1:]
            self._exception = True
        elif line.startswith("*."):
            hostname = line[2:]
            self._wild = True
        self._domain = _normalizeHostname(hostname)

    def domain(self):
        "The normalized domain this entry represents."
        return self._domain

    def exception(self):
        "True if this entry's domain does not denote an effective TLD."
        return self._exception

    def wild(self):
        "True if this entry represents a class of effective TLDs."
        return self._wild
102 #################
103 # DO EVERYTHING #
104 #################
def main(output, effective_tld_filename, output_format="cxx"):
    """
    Parses the effective TLD file and writes its DAFSA representation to
    output.

    effective_tld_filename is the effective TLD file to parse.
    output_format selects the serialization: "bin" writes the result of
    words_to_bin(), any other value writes the result of words_to_cxx().
    """

    def flag_code(entry):
        """
        Maps the entry's flags to the DAFSA's enum types: 1 for an exception
        entry, 2 for a wildcard entry, 0 otherwise.
        """
        if entry.exception():
            return 1
        return 2 if entry.wild() else 0

    # make_dafsa expects lines of the form "<domain_name><enum_value>"
    words = (
        "%s%d" % (entry.domain(), flag_code(entry))
        for entry in getEffectiveTLDs(effective_tld_filename)
    )

    # words_to_bin() returns a bytes while words_to_cxx() returns string
    serialize = words_to_bin if output_format == "bin" else words_to_cxx
    output.write(serialize(words))
if __name__ == "__main__":
    # This program can output the DAFSA in two formats:
    # as C++ code that will be included and compiled at build time
    # or as a binary file that will be published in Remote Settings.
    #
    # Flags for format options:
    # (no flag) -> C++ array [default]
    # "--bin"   -> Binary file

    # The flag may appear before or after the file name, so take the first
    # argument that is not the flag as the effective TLD file.
    cli_args = [arg for arg in sys.argv[1:] if arg != "--bin"]
    if not cli_args:
        sys.exit("usage: %s [--bin] <effective_tld_file>" % sys.argv[0])

    output_format = "bin" if "--bin" in sys.argv else "cxx"

    stream = sys.stdout
    if output_format == "bin":
        # words_to_bin() returns bytes, which the text-mode sys.stdout of
        # Python 3 rejects; write through its underlying binary buffer when
        # one exists.
        stream = getattr(sys.stdout, "buffer", sys.stdout)

    main(stream, cli_args[0], output_format=output_format)