1 # This Source Code Form is subject to the terms of the Mozilla Public
2 # License, v. 2.0. If a copy of the MPL was not distributed with this
3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 from make_dafsa
import words_to_cxx
, words_to_bin
14 Processes a file containing effective TLD data. See the following URL for a
15 description of effective TLDs and of the file format that this script
16 processes (although for the latter you're better off just reading this file's
19 http://wiki.mozilla.org/Gecko:Effective_TLD_Service
22 def getEffectiveTLDs(path
):
23 file = codecs
.open(path
, "r", "UTF-8")
27 # line always contains a line terminator unless the file is empty
31 # comment, empty, or superfluous line for explicitness purposes
32 if line
.startswith("//") or "." not in line
:
34 line
= re
.split(r
"[ \t\n]", line
, 1)[0]
35 entry
= EffectiveTLDEntry(line
)
36 domain
= entry
.domain()
37 assert domain
not in domains
, \
38 "repeating domain %s makes no sense" % domain
def _normalizeHostname(domain):
    """
    Normalizes the given domain, component by component.  ASCII components are
    lowercased, while non-ASCII components are processed using the ToASCII
    algorithm.

    domain is the hostname as text, with its components separated by ".".
    Returns the normalized hostname, with the components re-joined by ".".
    Errors raised by encodings.idna.ToASCII for a non-ASCII component it
    cannot convert are propagated unchanged.
    """
    def convertLabel(label):
        # A pure-ASCII component only needs lowercasing.  It must not be left
        # to ToASCII, which hands ASCII input back with its case untouched.
        if all(ord(c) < 128 for c in label):
            return label.lower()
        # ToASCII returns an encoded byte string; decode it so that every
        # component passed to join() below is text.
        return encodings.idna.ToASCII(label).decode("utf-8")

    return ".".join(convertLabel(label) for label in domain.split("."))
55 "True if s consists entirely of ASCII characters, false otherwise."
61 class EffectiveTLDEntry
:
63 Stores an entry in an effective-TLD name file.
69 def __init__(self
, line
):
71 Creates a TLD entry from a line of data, which must have been stripped of
74 if line
.startswith("!"):
75 self
._exception
= True
77 elif line
.startswith("*."):
82 self
._domain
= _normalizeHostname(domain
)
85 "The domain this represents."
89 "True if this entry's domain does not denote an effective TLD."
90 return self
._exception
93 "True if this entry represents a class of effective TLDs."
101 def main(output
, effective_tld_filename
, output_format
="cxx"):
103 effective_tld_filename is the effective TLD file to parse.
104 Depending on output_format, either C++ source for an array holding the
105 binary representation of a DAFSA built from the eTLD file (the default,
106 "cxx") or that binary representation itself ("bin") is written to output.
111 Maps the flags to the DAFSA's enum types.
122 make_dafsa expects lines of the form "<domain_name><enum_value>"
124 for etld
in getEffectiveTLDs(effective_tld_filename
):
125 yield "%s%d" % (etld
.domain(), typeEnum(etld
))
127 """ words_to_bin() returns a bytes while words_to_cxx() returns string """
128 if output_format
== "bin":
129 if sys
.version_info
[0] >= 3:
130 output
= output
.buffer
131 output
.write(words_to_bin(dafsa_words()))
133 output
.write(words_to_cxx(dafsa_words()))
if __name__ == '__main__':
    # This program can output the DAFSA in two formats:
    # as C++ code that will be included and compiled at build time
    # or as a binary file that will be published in Remote Settings.
    #
    # Flags for format options:
    # "cxx" -> C++ array [default]
    # "bin" -> binary output, selected by passing --bin
    #
    # The effective TLD file is the first argument that is not the --bin
    # flag, so the flag may appear before or after the file name.
    positional = [arg for arg in sys.argv[1:] if arg != "--bin"]
    if not positional:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit("usage: %s <effective_tld_file> [--bin]" % sys.argv[0])

    output_format = "bin" if "--bin" in sys.argv else "cxx"
    main(sys.stdout, positional[0], output_format=output_format)