Add: some doc to PreLoadUnicodePage.
[docutils.git] / tools / unicode2rstsubs.py
blobd5e2598630cc49b867fb93795cda5c44e143bfe2
1 #! /usr/bin/env python
3 # Author: David Goodger
4 # Contact: goodger@users.sourceforge.net
5 # Revision: $Revision$
6 # Date: $Date$
7 # Copyright: This program has been placed in the public domain.
9 """
10 unicode2subfiles.py -- produce character entity files (reStructuredText
11 substitutions) from the MathML master unicode.xml file.
13 This program extracts character entity and entity set information from a
14 unicode.xml file and produces multiple reStructuredText files (in the current
15 directory) containing substitutions. Entity sets are from ISO 8879 & ISO
16 9573-13 (combined), MathML, and HTML4. One or two files are produced for each
17 entity set; a second file with a "-wide.txt" suffix is produced if there are
18 wide-Unicode characters in the set.
20 The input file, unicode.xml, is maintained as part of the MathML 2
21 Recommendation XML source, and is available at
22 <http://www.w3.org/Math/characters/unicode.xml> (as of 2003-06-22).
23 """
25 import sys
26 import os
27 import optparse
28 import re
29 from xml.parsers.expat import ParserCreate
usage_msg = """Usage: %s [unicode.xml]"""

def usage(prog, status=0, msg=None):
    """
    Write the usage line (and `msg`, if given) to stderr, then exit.

    `prog` is the program name interpolated into the usage line, `status` is
    the exit status handed to ``sys.exit``, and `msg` is an optional extra
    line (typically an error explanation) printed after the usage line.
    This function never returns; it always raises ``SystemExit``.
    """
    # Plain ``write`` calls instead of ``print >>sys.stderr``: the output is
    # the same, but this form is valid on both Python 2 and Python 3.
    sys.stderr.write(usage_msg % prog + '\n')
    if msg:
        sys.stderr.write('%s\n' % (msg,))
    sys.exit(status)
def main(argv=None):
    """
    Command-line entry point: choose the input path and process it.

    `argv` defaults to ``sys.argv``.  With no argument the input is
    "unicode.xml" in the current directory; with exactly one argument it is
    that path.  More than one argument exits with status 2, and a path that
    is not an existing file exits with status 1 (both via `usage()`).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) == 2:
        inpath = argv[1]
    elif len(argv) > 2:
        # usage() exits, so `inpath` is never read unbound below.
        usage(argv[0], 2,
              'Too many arguments (%s): only 1 expected.' % (len(argv) - 1))
    else:
        inpath = 'unicode.xml'
    if not os.path.isfile(inpath):
        usage(argv[0], 1, 'No such file: "%s".' % inpath)
    # Binary mode: the XML parser works on raw bytes and honours the
    # document's own encoding declaration; text mode could mangle the data.
    infile = open(inpath, 'rb')
    try:
        process(infile)
    finally:
        # Always release the input file, even if parsing or writing fails.
        infile.close()
def process(infile):
    """
    Run the full extraction on `infile`: parse it, group the character
    entities by entity set, and write out the substitution files.
    """
    extractor = CharacterEntitySetExtractor(infile)
    extractor.group()
    extractor.write_sets()
class CharacterEntitySetExtractor:

    """
    Extracts character entity information from unicode.xml file, groups it by
    entity set, and writes out reStructuredText substitution files.

    Typical use: instantiate with an open unicode.xml file object, call
    `group()` to parse it, then `write_sets()` to write one "<set>.txt" file
    per entity set (plus "<set>-wide.txt" for sets containing characters
    above U+FFFF) into the current directory.
    """

    unwanted_entity_sets = ['stix',         # unknown, buggy set
                            'predefined']

    def __init__(self, infile):
        self.infile = infile
        """Input unicode.xml file."""

        self.parser = self.setup_parser()
        """XML parser."""

        self.elements = []
        """Stack of element names.  Last is current element."""

        self.sets = {}
        """Mapping of charent set name to set dict."""

        self.charid = None
        """Current character's "id" attribute value."""

        self.descriptions = {}
        """Mapping of character ID to description."""

    def setup_parser(self):
        """Return an expat parser wired to this object's handler methods."""
        parser = ParserCreate()
        parser.StartElementHandler = self.StartElementHandler
        parser.EndElementHandler = self.EndElementHandler
        parser.CharacterDataHandler = self.CharacterDataHandler
        return parser

    def group(self):
        """Parse the input file, filling `self.sets` & `self.descriptions`."""
        self.parser.ParseFile(self.infile)

    def StartElementHandler(self, name, attributes):
        """
        Push `name` on the element stack and dispatch to a method called
        ``<name>_start``, if this object has one.
        """
        self.elements.append(name)
        handler = name + '_start'
        if hasattr(self, handler):
            getattr(self, handler)(name, attributes)

    def EndElementHandler(self, name):
        """
        Pop `name` off the element stack and dispatch to a method called
        ``<name>_end``, if this object has one.
        """
        # The message refers to `self.elements` (the stack); the former
        # `self.element` did not exist, so a mismatch raised AttributeError
        # instead of the intended AssertionError.
        assert self.elements[-1] == name, \
               'unknown end-tag %r (%r)' % (name, self.elements)
        self.elements.pop()
        handler = name + '_end'
        if hasattr(self, handler):
            getattr(self, handler)(name)

    def CharacterDataHandler(self, data):
        """
        Dispatch character data to a method called ``<element>_data`` for the
        current (innermost) element, if this object has one.
        """
        handler = self.elements[-1] + '_data'
        if hasattr(self, handler):
            getattr(self, handler)(data)

    def character_start(self, name, attributes):
        """Remember the "id" of the <character> element being parsed."""
        self.charid = attributes['id']

    def entity_start(self, name, attributes):
        """
        Record an <entity> element: map its "id" (the entity name) to the
        current character ID within its (normalized) entity set.
        """
        set_name = self.entity_set_name(attributes['set'])
        if not set_name:
            return                      # unwanted entity set
        if set_name not in self.sets:
            print('bad set: %r' % set_name)
            return
        entity = attributes['id']
        entities = self.sets[set_name]
        # An entity name may recur within a set only if it maps to the same
        # character each time.
        assert (entity not in entities
                or entities[entity] == self.charid), \
               ('sets[%r][%r] == %r (!= %r)'
                % (set_name, entity, entities[entity], self.charid))
        entities[entity] = self.charid

    def description_data(self, data):
        """
        Append `data` to the current character's description.  The parser may
        deliver one text node in several pieces, hence the accumulation.
        """
        self.descriptions.setdefault(self.charid, '')
        self.descriptions[self.charid] += data

    entity_set_name_pat = re.compile(r'[0-9-]*(.+)$')
    """Pattern to strip ISO numbers off the beginning of set names."""

    def entity_set_name(self, name):
        """
        Return lowcased and standard-number-free entity set name.
        Return ``None`` for unwanted entity sets.

        As a side effect, an empty set dict is registered in `self.sets` for
        every wanted name seen.
        """
        match = self.entity_set_name_pat.match(name)
        name = match.group(1).lower()
        if name in self.unwanted_entity_sets:
            return None
        self.sets.setdefault(name, {})
        return name

    def write_sets(self):
        """Write the output file(s) for every entity set, in name order."""
        set_names = list(self.sets.keys())
        set_names.sort()
        for set_name in set_names:
            self.write_set(set_name)

    def write_set(self, set_name, wide=None):
        """
        Write the substitution file for entity set `set_name`.

        When `wide` is false, the file is "<set_name>.txt" and entities that
        contain wide-Unicode characters are left out; if any were left out, a
        second pass (`wide` true) writes "<set_name>-wide.txt" with every
        entity of the set.
        """
        if wide:
            outname = set_name + '-wide.txt'
        else:
            outname = set_name + '.txt'
        outfile = open(outname, 'w')
        print('writing file "%s"' % outname)
        try:
            entity_set = self.sets[set_name]
            # Sort case-insensitively, falling back on the exact name.
            keyed_names = [(e.lower(), e) for e in entity_set.keys()]
            keyed_names.sort()
            longest = 0
            for _, entity_name in keyed_names:
                longest = max(longest, len(entity_name))
            has_wide = None
            for _, entity_name in keyed_names:
                has_wide = self.write_entity(
                    entity_set, set_name, entity_name, outfile, longest,
                    wide) or has_wide
        finally:
            # Close explicitly so the data is flushed and the handle released
            # without relying on garbage collection.
            outfile.close()
        if has_wide and not wide:
            self.write_set(set_name, 1)

    def write_entity(self, set, set_name, entity_name, outfile, longest,
                     wide=None):
        """
        Write one substitution definition line for `entity_name` to `outfile`.

        `set` maps entity names to character IDs; `longest` is the length of
        the longest entity name in the set, used to align the output.
        Return 1 (writing nothing) if `wide` is false and the character ID
        contains a code point above U+FFFF; otherwise return ``None``.
        """
        charid = set[entity_name]
        # A character ID is a prefix character followed by one or more
        # hyphen-separated hexadecimal code points.
        codepoints = charid[1:].split('-')
        if not wide:
            for code in codepoints:
                if int(code, 16) > 0xFFFF:
                    return 1            # wide-Unicode character
        codes = ' '.join(['U+%s' % code for code in codepoints])
        outfile.write('.. %-*s unicode:: %s .. %s\n'
                      % (longest + 2, '|' + entity_name + '|',
                         codes, self.descriptions[charid]))
# Run as a script: hand main()'s return value to the shell as exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)