latex2e writer : Move usepackage hyperref after stylesheet inclusion.
[docutils.git] / tools / dev / unicode2rstsubs.py
blob9161e10c1d3103094a961533e28972893268045a
1 #! /usr/bin/env python
3 # $Id$
4 # Author: David Goodger <goodger@python.org>
5 # Copyright: This program has been placed in the public domain.
7 """
unicode2rstsubs.py -- produce character entity files (reStructuredText
9 substitutions) from the W3C master unicode.xml file.
11 This program extracts character entity and entity set information from a
12 unicode.xml file and produces multiple reStructuredText files (in the current
13 directory) containing substitutions. Entity sets are from ISO 8879 & ISO
14 9573-13 (combined), MathML, and HTML4. One or two files are produced for each
15 entity set; a second file with a "-wide.txt" suffix is produced if there are
16 wide-Unicode characters in the set.
18 The input file, unicode.xml, is maintained as part of the MathML 2
Recommendation XML source, and is available from
20 <http://www.w3.org/2003/entities/xml/>.
21 """
23 import sys
24 import os
25 import optparse
26 import re
27 from xml.parsers.expat import ParserCreate
usage_msg = """Usage: %s [unicode.xml]"""


def usage(prog, status=0, msg=None):
    """
    Write the usage line (and optional message) to stderr, then exit.

    :Parameters:
        - `prog`: program name interpolated into the usage line.
        - `status`: process exit status passed to `sys.exit()`.
        - `msg`: optional extra line (e.g. an error description) written
          after the usage line; skipped when empty or ``None``.

    Never returns: always raises `SystemExit` via `sys.exit(status)`.
    """
    # Use stream.write() rather than the ``print >>stream`` statement so the
    # same source runs under both Python 2 and Python 3.
    sys.stderr.write(usage_msg % prog + '\n')
    if msg:
        sys.stderr.write('%s\n' % (msg,))
    sys.exit(status)
def main(argv=None):
    """
    Command-line entry point.

    :Parameters:
        - `argv`: argument vector including the program name at index 0;
          defaults to `sys.argv`.  At most one argument (the path to
          ``unicode.xml``) is accepted; with none, ``unicode.xml`` in the
          current directory is used.

    Exits through `usage()` with status 2 when too many arguments are given
    and with status 1 when the input path is not an existing file.
    Returns ``None`` on success.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) == 2:
        inpath = argv[1]
    elif len(argv) > 2:
        usage(argv[0], 2,
              'Too many arguments (%s): only 1 expected.' % (len(argv) - 1))
    else:
        inpath = 'unicode.xml'
    if not os.path.isfile(inpath):
        usage(argv[0], 1, 'No such file: "%s".' % inpath)
    # Binary mode: the expat parser's ParseFile() needs read() to return
    # bytes on Python 3, and decodes the XML itself.
    infile = open(inpath, 'rb')
    try:
        process(infile)
    finally:
        # Always release the input file, even if parsing or writing fails.
        infile.close()
def process(infile):
    """
    Parse `infile` (an open unicode.xml file object), group its character
    entities by entity set, and write out one substitution file per set.
    """
    extractor = CharacterEntitySetExtractor(infile)
    extractor.group()
    extractor.write_sets()
class CharacterEntitySetExtractor:

    """
    Extracts character entity information from unicode.xml file, groups it by
    entity set, and writes out reStructuredText substitution files.

    Typical use: construct with an open ``unicode.xml`` file object, call
    `group()` to parse it, then `write_sets()` to emit one ``<set>.txt`` file
    (plus ``<set>-wide.txt`` where the set contains characters above U+FFFF)
    per entity set, relative to the current directory.
    """

    unwanted_entity_sets = ['stix',         # unknown, buggy set
                            'predefined']

    # Continuation lines are indented so that they stay inside the
    # reStructuredText comment started by the preceding ".." line.
    header = """\
.. This data file has been placed in the public domain.
.. Derived from the Unicode character mappings available from
   <http://www.w3.org/2003/entities/xml/>.
   Processed by unicode2rstsubs.py, part of Docutils:
   <http://docutils.sourceforge.net>.
"""

    def __init__(self, infile):
        self.infile = infile
        """Input unicode.xml file."""

        self.parser = self.setup_parser()
        """XML parser."""

        self.elements = []
        """Stack of element names.  Last is current element."""

        self.sets = {}
        """Mapping of charent set name to set dict."""

        self.charid = None
        """Current character's "id" attribute value."""

        self.descriptions = {}
        """Mapping of character ID to description."""

    def setup_parser(self):
        """Create an expat parser wired to this object's handler methods."""
        parser = ParserCreate()
        parser.StartElementHandler = self.StartElementHandler
        parser.EndElementHandler = self.EndElementHandler
        parser.CharacterDataHandler = self.CharacterDataHandler
        return parser

    def group(self):
        """Parse the input file, filling `self.sets` & `self.descriptions`."""
        self.parser.ParseFile(self.infile)

    def StartElementHandler(self, name, attributes):
        """Push `name` and dispatch to a ``<name>_start`` method, if any."""
        self.elements.append(name)
        handler = getattr(self, name + '_start', None)
        if handler is not None:
            handler(name, attributes)

    def EndElementHandler(self, name):
        """Pop `name` and dispatch to a ``<name>_end`` method, if any."""
        # The message must reference `self.elements` (the stack); a misspelt
        # attribute here would mask the assertion with an AttributeError.
        assert self.elements[-1] == name, \
            'unknown end-tag %r (%r)' % (name, self.elements)
        self.elements.pop()
        handler = getattr(self, name + '_end', None)
        if handler is not None:
            handler(name)

    def CharacterDataHandler(self, data):
        """Dispatch text to a ``<current element>_data`` method, if any."""
        handler = getattr(self, self.elements[-1] + '_data', None)
        if handler is not None:
            handler(data)

    def character_start(self, name, attributes):
        """Remember the "id" of the <character> element being processed."""
        self.charid = attributes['id']

    def entity_start(self, name, attributes):
        """
        Record an <entity> element: map its "id" (the entity name) to the
        current character ID within the entity's (normalized) set.
        """
        set_name = self.entity_set_name(attributes['set'])
        if not set_name:
            return                      # unwanted entity set
        if set_name not in self.sets:
            # Defensive only: `entity_set_name` registers every name it
            # returns, so this is not expected to trigger.
            print('bad set: %r' % set_name)
            return
        entity = attributes['id']
        charset = self.sets[set_name]
        # An entity name may be seen again, but only for the same character.
        previous = charset.get(entity, self.charid)
        assert previous == self.charid, \
            ('sets[%r][%r] == %r (!= %r)'
             % (set_name, entity, previous, self.charid))
        charset[entity] = self.charid

    def description_data(self, data):
        """Append text (possibly delivered in pieces) to the description."""
        self.descriptions[self.charid] = (
            self.descriptions.get(self.charid, '') + data)

    entity_set_name_pat = re.compile(r'[0-9-]*(.+)$')
    """Pattern to strip ISO numbers off the beginning of set names."""

    def entity_set_name(self, name):
        """
        Return lowcased and standard-number-free entity set name.
        Return ``None`` for unwanted entity sets.

        As a side effect, a wanted set name is registered in `self.sets`
        (with an empty dict) if not already present.
        """
        match = self.entity_set_name_pat.match(name)
        if match is None:
            # Nothing left to name the set (e.g. empty attribute value).
            return None
        name = match.group(1).lower()
        if name in self.unwanted_entity_sets:
            return None
        self.sets.setdefault(name, {})
        return name

    def write_sets(self):
        """Write the output file(s) for every entity set, in name order."""
        for set_name in sorted(self.sets):
            self.write_set(set_name)

    def write_set(self, set_name, wide=None):
        """
        Write the substitution file for entity set `set_name`.

        With `wide` false, write ``<set_name>.txt`` and skip wide-Unicode
        characters; if any were skipped, call this method again with `wide`
        true to write ``<set_name>-wide.txt``, in which no entity is skipped.
        """
        if wide:
            outname = set_name + '-wide.txt'
        else:
            outname = set_name + '.txt'
        outfile = open(outname, 'w')
        try:
            print('writing file "%s"' % outname)
            outfile.write(self.header + '\n')
            set = self.sets[set_name]
            # Case-insensitive order, original spelling as the tie-breaker.
            entities = sorted([(e.lower(), e) for e in set])
            longest = max([len(e) for e in set] or [0])
            has_wide = None
            for _, entity_name in entities:
                has_wide = self.write_entity(
                    set, set_name, entity_name, outfile, longest, wide
                    ) or has_wide
        finally:
            # Close (and flush) this file before any "-wide" pass begins.
            outfile.close()
        if has_wide and not wide:
            self.write_set(set_name, 1)

    def write_entity(self, set, set_name, entity_name, outfile, longest,
                     wide=None):
        """
        Write one substitution definition line for `entity_name`.

        :Parameters:
            - `set`: mapping of entity name to character ID.
            - `set_name`: name of the entity set (not used here).
            - `entity_name`: key into `set`.
            - `outfile`: object with a ``write()`` method.
            - `longest`: length of the longest entity name in the set, used
              to align the "unicode::" column.
            - `wide`: if false, wide-Unicode characters are not written.

        Return 1 (writing nothing) if `wide` is false and any code point of
        the character is above 0xFFFF; otherwise write the line and return
        ``None``.  Raises `KeyError` if the character has no description.
        """
        charid = set[entity_name]
        # The leading character of the ID is dropped; the rest is one or
        # more '-'-separated hexadecimal code points.
        hexcodes = charid[1:].split('-')
        if not wide:
            for code in hexcodes:
                if int(code, 16) > 0xFFFF:
                    return 1            # wide-Unicode character
        codes = ' '.join(['U+%s' % code for code in hexcodes])
        outfile.write('.. %-*s unicode:: %s .. %s\n'
                      % (longest + 2, '|' + entity_name + '|',
                         codes, self.descriptions[charid]))
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == '__main__':
    sys.exit(main())