Make tools/ compatible with both, Python 2 and 3 without 2to3-conversion.
[docutils.git] / tools / dev / unicode2rstsubs.py
blobb4e65e10908a75b5f493bc68c3ce148263fac985
1 #! /usr/bin/env python
3 # $Id$
4 # Author: David Goodger <goodger@python.org>
5 # Copyright: This program has been placed in the public domain.
7 """
8 unicode2rstsubs.py -- produce character entity files (reStructuredText
9 substitutions) from the W3C master unicode.xml file.
11 This program extracts character entity and entity set information from a
12 unicode.xml file and produces multiple reStructuredText files (in the current
13 directory) containing substitutions. Entity sets are from ISO 8879 & ISO
14 9573-13 (combined), MathML, and HTML4. One or two files are produced for each
15 entity set; a second file with a "-wide.txt" suffix is produced if there are
16 wide-Unicode characters in the set.
18 The input file, unicode.xml, is maintained as part of the MathML 2
19 Recommendation XML source, and is available from
20 <http://www.w3.org/2003/entities/xml/>.
21 """
23 import sys
24 import os
25 import optparse
26 import re
27 from xml.parsers.expat import ParserCreate
usage_msg = """Usage: %s [unicode.xml]\n"""


def usage(prog, status=0, msg=None):
    """
    Print the usage line (and optionally `msg`) to stderr, then exit.

    Parameters:

    - `prog`: program name interpolated into the usage line.
    - `status`: exit status handed to `sys.exit()`.
    - `msg`: optional extra message; written on its own line if truthy.

    Never returns: always raises `SystemExit`.
    """
    stream = sys.stderr
    stream.write(usage_msg % prog)
    if msg:
        stream.write(msg + '\n')
    sys.exit(status)
def main(argv=None):
    """
    Command-line entry point.

    `argv` defaults to `sys.argv`.  It may name at most one input file
    (``argv[1]``); without one, ``unicode.xml`` in the current directory is
    used.  Exits through `usage()` with status 2 if too many arguments are
    given and with status 1 if the input path is not an existing file.

    Returns None after the input has been processed.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) > 2:
        usage(argv[0], 2,
              'Too many arguments (%s): only 1 expected.' % (len(argv) - 1))
    if len(argv) == 2:
        inpath = argv[1]
    else:
        inpath = 'unicode.xml'
    if not os.path.isfile(inpath):
        usage(argv[0], 1, 'No such file: "%s".' % inpath)
    # The expat parser consumes raw bytes, so binary mode is right for both
    # Python 2 and Python 3.  The `with` block guarantees the file is closed
    # again, even if parsing or writing fails.
    with open(inpath, 'rb') as infile:
        process(infile)
def process(infile):
    """
    Parse `infile` (an open unicode.xml file) and write the substitution
    files for all entity sets found in it.
    """
    extractor = CharacterEntitySetExtractor(infile)
    extractor.group()
    extractor.write_sets()
class CharacterEntitySetExtractor:

    """
    Extracts character entity information from unicode.xml file, groups it by
    entity set, and writes out reStructuredText substitution files.

    Usage: instantiate with an open unicode.xml file object, call `group()`
    to parse it, then `write_sets()` to write one ``<set>.txt`` file per
    entity set (plus ``<set>-wide.txt`` where needed) into the current
    directory.
    """

    # Entity sets that are skipped entirely.
    unwanted_entity_sets = ['stix',         # unknown, buggy set
                            'predefined']

    # Written at the top of every output file.  The continuation lines are
    # indented so that they stay part of the reStructuredText comment.
    header = """\
.. This data file has been placed in the public domain.
.. Derived from the Unicode character mappings available from
   <http://www.w3.org/2003/entities/xml/>.
   Processed by unicode2rstsubs.py, part of Docutils:
   <http://docutils.sourceforge.net>.
"""

    # Pattern to strip ISO numbers off the beginning of set names.
    entity_set_name_pat = re.compile(r'[0-9-]*(.+)$')

    def __init__(self, infile):
        # Input unicode.xml file.
        self.infile = infile
        # XML parser, wired to this object's handler methods.
        self.parser = self.setup_parser()
        # Stack of element names.  Last is current element.
        self.elements = []
        # Mapping of charent set name to set dict
        # (entity name -> character ID).
        self.sets = {}
        # Current character's "id" attribute value.
        self.charid = None
        # Mapping of character ID to description.
        self.descriptions = {}

    def setup_parser(self):
        """Return a new expat parser that calls back into this object."""
        parser = ParserCreate()
        parser.StartElementHandler = self.StartElementHandler
        parser.EndElementHandler = self.EndElementHandler
        parser.CharacterDataHandler = self.CharacterDataHandler
        return parser

    def group(self):
        """Parse the input file, filling `self.sets` & `self.descriptions`."""
        self.parser.ParseFile(self.infile)

    def StartElementHandler(self, name, attributes):
        """Push `name` and dispatch to ``<name>_start()``, if defined."""
        self.elements.append(name)
        handler = getattr(self, name + '_start', None)
        if handler is not None:
            handler(name, attributes)

    def EndElementHandler(self, name):
        """Pop `name` and dispatch to ``<name>_end()``, if defined."""
        # The message must refer to `self.elements` (the stack); there is no
        # `self.element` attribute.
        assert self.elements[-1] == name, \
               'unknown end-tag %r (%r)' % (name, self.elements)
        self.elements.pop()
        handler = getattr(self, name + '_end', None)
        if handler is not None:
            handler(name)

    def CharacterDataHandler(self, data):
        """Dispatch text to ``<current element>_data()``, if defined."""
        handler = getattr(self, self.elements[-1] + '_data', None)
        if handler is not None:
            handler(data)

    def character_start(self, name, attributes):
        """Remember the "id" of the <character> element being parsed."""
        self.charid = attributes['id']

    def entity_start(self, name, attributes):
        """
        Record an <entity> element: map its "id" (the entity name) to the
        current character ID within the entity set named by its "set"
        attribute.  Entities of unwanted sets are ignored.
        """
        set_name = self.entity_set_name(attributes['set'])
        if not set_name:
            return
        if set_name not in self.sets:
            # Defensive only: entity_set_name() registers every name that it
            # returns.
            print('bad set: %r' % set_name)
            return
        entity = attributes['id']
        charents = self.sets[set_name]
        # An entity name must not map to two different characters in a set.
        assert (entity not in charents
                or charents[entity] == self.charid), \
               ('sets[%r][%r] == %r (!= %r)'
                % (set_name, entity, charents[entity], self.charid))
        charents[entity] = self.charid

    def description_data(self, data):
        """Append `data` to the current character's description."""
        # Text may arrive in several chunks, so accumulate it.
        self.descriptions[self.charid] = (
            self.descriptions.get(self.charid, '') + data)

    def entity_set_name(self, name):
        """
        Return lowcased and standard-number-free entity set name.
        Return ``None`` for unwanted entity sets.

        A returned name is registered in `self.sets` (with an empty dict) if
        it is not there already.  ``None`` is also returned for a name the
        pattern cannot match, such as the empty string.
        """
        match = self.entity_set_name_pat.match(name)
        if match is None:
            return None
        name = match.group(1).lower()
        if name in self.unwanted_entity_sets:
            return None
        self.sets.setdefault(name, {})
        return name

    def write_sets(self):
        """Write the output file(s) for every entity set, in sorted order."""
        for set_name in sorted(self.sets):
            self.write_set(set_name)

    def write_set(self, set_name, wide=None):
        """
        Write the substitution file for entity set `set_name`.

        The file is ``<set_name>.txt``, or ``<set_name>-wide.txt`` if `wide`
        is true.  When a non-wide pass skips wide-Unicode characters, a
        second, wide pass is run for the same set afterwards.
        """
        if wide:
            outname = set_name + '-wide.txt'
        else:
            outname = set_name + '.txt'
        charents = self.sets[set_name]
        # Case-insensitive order; ties are broken by the exact spelling.
        entity_names = sorted(charents, key=lambda e: (e.lower(), e))
        # Column width for the "|name|" field; the leading 0 covers an
        # empty set.
        longest = max([0] + [len(e) for e in entity_names])
        has_wide = False
        # `with` closes the file before the wide pass opens the next one.
        with open(outname, 'w') as outfile:
            print('writing file "%s"' % outname)
            outfile.write(self.header + '\n')
            for entity_name in entity_names:
                if self.write_entity(charents, set_name, entity_name,
                                     outfile, longest, wide):
                    has_wide = True
        if has_wide and not wide:
            self.write_set(set_name, True)

    def write_entity(self, set, set_name, entity_name, outfile, longest,
                     wide=None):
        """
        Write the substitution definition for `entity_name` to `outfile`.

        `set` is the entity set's dict (entity name -> character ID); the
        parameter name shadows the builtin but is kept for compatibility.
        `longest` is the length of the longest entity name in the set and
        is used to align the output columns.

        Return true without writing anything if `wide` is false and the
        character has a code point above U+FFFF; otherwise write the line
        and return false.
        """
        charid = set[entity_name]
        # Everything after the first character of a character ID is parsed
        # as hexadecimal code points separated by "-".
        codes = charid[1:].split('-')
        if not wide:
            for code in codes:
                if int(code, 16) > 0xFFFF:
                    return True         # wide-Unicode character
        outfile.write('.. %-*s unicode:: %s .. %s\n'
                      % (longest + 2, '|' + entity_name + '|',
                         ' '.join(['U+%s' % code for code in codes]),
                         self.descriptions[charid]))
        return False
if __name__ == '__main__':
    # Executed as a script: the value returned by main() is used as the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)