1 # This Source Code Form is subject to the terms of the Mozilla Public
2 # License, v. 2.0. If a copy of the MPL was not distributed with this
3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 from make_dafsa
import words_to_cxx
, words_to_bin
14 Processes a file containing effective TLD data. See the following URL for a
15 description of effective TLDs and of the file format that this script
16 processes (although for the latter you're better off just reading this file's source code).
19 http://wiki.mozilla.org/Gecko:Effective_TLD_Service
def getEffectiveTLDs(path):
    """
    Yields an EffectiveTLDEntry for every rule in the effective-TLD file at
    path, which is read as UTF-8.

    Lines that start with "//" and lines that are blank are skipped.  For every
    other line only the text before the first space, tab or newline is used.

    Raises AssertionError if two rules resolve to the same domain.
    """
    # NOTE(review): the loop scaffolding (the set of seen domains, the
    # iteration over lines and the yield) was rebuilt from the surviving
    # statements and from how main() iterates the result -- confirm against
    # the original before landing.
    domains = set()
    # The context manager closes the file even when a rule is rejected or the
    # consumer stops iterating early.
    with codecs.open(path, "r", "UTF-8") as tld_file:
        for line in tld_file:
            # Drop the line terminator up front; the split pattern below does
            # not cover a carriage return.
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if line.startswith("//") or not line.strip():
                continue
            # Keep only the rule itself, i.e. everything before the first
            # space, tab or newline.
            line = re.split(r"[ \t\n]", line, maxsplit=1)[0]
            entry = EffectiveTLDEntry(line)
            domain = entry.domain()
            assert domain not in domains, "repeating domain %s makes no sense" % domain
            domains.add(domain)
            yield entry
43 def _normalizeHostname(domain
):
45 Normalizes the given domain, component by component. ASCII components are
46 lowercased, while non-ASCII components are processed using the ToASCII
50 def convertLabel(label
):
53 return encodings
.idna
.ToASCII(label
).decode("utf-8")
55 return ".".join(map(convertLabel
, domain
.split(".")))
59 "True if s consists entirely of ASCII characters, false otherwise."
66 class EffectiveTLDEntry
:
68 Stores an entry in an effective-TLD name file.
74 def __init__(self
, line
):
76 Creates a TLD entry from a line of data, which must have been stripped of
79 if line
.startswith("!"):
80 self
._exception
= True
82 elif line
.startswith("*."):
87 self
._domain
= _normalizeHostname(domain
)
90 "The domain this represents."
94 "True if this entry's domain does not denote an effective TLD."
95 return self
._exception
98 "True if this entry represents a class of effective TLDs."
107 def main(output
, effective_tld_filename
, output_format
="cxx"):
109 effective_tld_filename is the effective TLD file to parse.
110 based on the output format, either a C++ array of a binary representation
111 of a DAFSA representing the eTLD file is then printed to standard output
112 or a binary file is written to disk.
117 Maps the flags to the DAFSA's enum types.
128 make_dafsa expects lines of the form "<domain_name><enum_value>"
130 for etld
in getEffectiveTLDs(effective_tld_filename
):
131 yield "%s%d" % (etld
.domain(), typeEnum(etld
))
133 """ words_to_bin() returns a bytes while words_to_cxx() returns string """
134 if output_format
== "bin":
135 output
.write(words_to_bin(dafsa_words()))
137 output
.write(words_to_cxx(dafsa_words()))
if __name__ == "__main__":
    # This program can output the DAFSA in two formats:
    # as C++ code that will be included and compiled at build time
    # or as a binary file that will be published in Remote Settings.
    #
    # Flags for format options:
    #   (no flag) -> "cxx", C++ array [default]
    #   --bin     -> "bin", binary output
    output_format = "bin" if "--bin" in sys.argv else "cxx"

    # Accept --bin before or after the file name: the input file is the first
    # argument that is not the flag.
    positional = [arg for arg in sys.argv[1:] if arg != "--bin"]
    if not positional:
        sys.exit("usage: %s [--bin] <effective_tld_filename>" % sys.argv[0])

    # words_to_bin() returns bytes, which a text-mode stdout refuses to write;
    # use the underlying binary buffer for that format when one exists.
    if output_format == "bin":
        out_stream = getattr(sys.stdout, "buffer", sys.stdout)
    else:
        out_stream = sys.stdout

    main(out_stream, positional[0], output_format=output_format)