3 # Author: David Goodger
4 # Contact: goodger@users.sourceforge.net
7 # Copyright: This program has been placed in the public domain.
10 unicode2subfiles.py -- produce character entity files (reStructuredText
11 substitutions) from the MathML master unicode.xml file.
13 This program extracts character entity and entity set information from a
14 unicode.xml file and produces multiple reStructuredText files (in the current
15 directory) containing substitutions. Entity sets are from ISO 8879 & ISO
16 9573-13 (combined), MathML, and HTML4. One or two files are produced for each
17 entity set; a second file with a "-wide.txt" suffix is produced if there are
18 wide-Unicode characters in the set.
20 The input file, unicode.xml, is maintained as part of the MathML 2
21 Recommendation XML source, and is available at
22 <http://www.w3.org/Math/characters/unicode.xml> (as of 2003-06-22).
29 from xml
.parsers
.expat
import ParserCreate
usage_msg = """Usage: %s [unicode.xml]"""


def usage(prog, status=0, msg=None):
    """
    Write the usage line (and an optional message) to stderr, then exit.

    `prog` is interpolated into `usage_msg` as the program name.  `msg`, if
    given and non-empty, is written on its own line after the usage line.
    The process exits with `status` via `sys.exit` (raises `SystemExit`).
    """
    # Plain writes instead of the ``print >>`` statement so the function
    # behaves the same under Python 2 and Python 3.
    sys.stderr.write(usage_msg % prog + '\n')
    if msg:
        # Skip the message line entirely when no message was supplied,
        # rather than emitting a literal "None".
        sys.stderr.write('%s\n' % (msg,))
    sys.exit(status)
47 'Too many arguments (%s): only 1 expected.' % (len(argv
) - 1))
49 inpath
= 'unicode.xml'
50 if not os
.path
.isfile(inpath
):
51 usage(argv
[0], 1, 'No such file: "%s".' % inpath
)
56 grouper
= CharacterEntitySetExtractor(infile
)
class CharacterEntitySetExtractor:

    """
    Extracts character entity information from unicode.xml file, groups it by
    entity set, and writes out reStructuredText substitution files.

    The input is parsed with expat.  Element events are dispatched by name to
    optional ``<element>_start``, ``<element>_end`` and ``<element>_data``
    methods of this class; elements without such methods are ignored.
    """

    unwanted_entity_sets = ['stix',     # unknown, buggy set
                            # NOTE(review): second entry restored from an
                            # elided continuation line -- confirm.
                            'predefined']

    def __init__(self, infile):
        self.infile = infile
        """Input unicode.xml file."""

        self.parser = self.setup_parser()
        """Expat parser wired to this object's handler methods."""

        self.elements = []
        """Stack of element names.  Last is current element."""

        self.sets = {}
        """Mapping of charent set name to set dict."""

        self.charid = None
        """Current character's "id" attribute value."""

        self.descriptions = {}
        """Mapping of character ID to description."""

    def setup_parser(self):
        """Create an expat parser whose callbacks are this object's handlers."""
        parser = ParserCreate()
        parser.StartElementHandler = self.StartElementHandler
        parser.EndElementHandler = self.EndElementHandler
        parser.CharacterDataHandler = self.CharacterDataHandler
        return parser

    # NOTE(review): this method's name was not visible in the reviewed
    # text; `group` is assumed from the caller's `grouper` object -- confirm.
    def group(self):
        """Parse the input file, filling `self.sets` and `self.descriptions`."""
        self.parser.ParseFile(self.infile)

    def StartElementHandler(self, name, attributes):
        """Push `name` on the element stack and call ``<name>_start`` if defined."""
        self.elements.append(name)
        handler = getattr(self, name + '_start', None)
        if handler is not None:
            handler(name, attributes)

    def EndElementHandler(self, name):
        """Pop `name` off the element stack and call ``<name>_end`` if defined."""
        assert self.elements[-1] == name, \
            'unknown end-tag %r (%r)' % (name, self.elements)
        # The stack must shrink here, otherwise the check above fails on the
        # first enclosing end-tag and `_data` dispatch uses a stale element.
        self.elements.pop()
        handler = getattr(self, name + '_end', None)
        if handler is not None:
            handler(name)

    def CharacterDataHandler(self, data):
        """Forward text to ``<current element>_data`` if such a method exists."""
        handler = getattr(self, self.elements[-1] + '_data', None)
        if handler is not None:
            handler(data)

    def character_start(self, name, attributes):
        """Remember the "id" of the <character> element being processed."""
        self.charid = attributes['id']

    def entity_start(self, name, attributes):
        """
        Record an <entity> element: map its "id" to the current character ID
        inside the entity set named by its "set" attribute.

        Entities belonging to unwanted sets are skipped.  An entity name that
        is already recorded in the set must refer to the same character.
        """
        set_name = self.entity_set_name(attributes['set'])
        if not set_name:
            return
        if set_name not in self.sets:
            print('bad set: %r' % set_name)
            return
        entity = attributes['id']
        entities = self.sets[set_name]
        assert (entity not in entities
                or entities[entity] == self.charid), \
            ('sets[%r][%r] == %r (!= %r)'
             % (set_name, entity, entities[entity], self.charid))
        entities[entity] = self.charid

    def description_data(self, data):
        """Append `data` to the current character's description text."""
        # Text may arrive in several pieces, so accumulate rather than assign.
        self.descriptions.setdefault(self.charid, '')
        self.descriptions[self.charid] += data

    entity_set_name_pat = re.compile(r'[0-9-]*(.+)$')
    """Pattern to strip ISO numbers off the beginning of set names."""

    def entity_set_name(self, name):
        """
        Return lowcased and standard-number-free entity set name.
        Return ``None`` for unwanted entity sets.

        As a side effect, an empty dict is registered in `self.sets` for a
        wanted set name that has not been seen before.
        """
        match = self.entity_set_name_pat.match(name)
        if match is None:
            # Nothing usable in the attribute value (e.g. an empty string).
            return None
        name = match.group(1).lower()
        if name in self.unwanted_entity_sets:
            return None
        self.sets.setdefault(name, {})
        return name

    def write_sets(self):
        """Write the substitution file(s) for every collected entity set."""
        for set_name in sorted(self.sets):
            self.write_set(set_name)

    def write_set(self, set_name, wide=None):
        """
        Write one entity set to ``<set_name>.txt`` in the current directory,
        or to ``<set_name>-wide.txt`` when `wide` is true.

        When writing the regular file, entities that `write_entity` reports
        as wide-Unicode are left out; if any were found, the method calls
        itself once more with `wide` set to produce the "-wide.txt" file.
        """
        if wide:
            outname = set_name + '-wide.txt'
        else:
            outname = set_name + '.txt'
        entity_map = self.sets[set_name]
        # Case-insensitive ordering, with the exact name as tie-breaker.
        entities = sorted(entity_map, key=lambda e: (e.lower(), e))
        # Width of the longest name, used to align the output columns.
        longest = max([0] + [len(entity_name) for entity_name in entities])
        has_wide = None
        # The context manager guarantees the file is flushed and closed even
        # if writing an entity fails.
        with open(outname, 'w') as outfile:
            print('writing file "%s"' % outname)
            for entity_name in entities:
                has_wide = self.write_entity(
                    entity_map, set_name, entity_name, outfile, longest,
                    wide) or has_wide
        if has_wide and not wide:
            self.write_set(set_name, 1)

    def write_entity(self, set, set_name, entity_name, outfile, longest,
                     wide=None):
        """
        Write the substitution definition for `entity_name` to `outfile`.

        `set` maps entity names to character IDs; the ID, minus its first
        character, is a "-"-separated list of hexadecimal code points.
        `longest` is the length of the longest entity name in the set and
        fixes the column width.

        Unless `wide` is true, a character with any code point above 0xFFFF
        is not written and 1 is returned; otherwise ``None`` is returned.
        """
        charid = set[entity_name]
        code_points = charid[1:].split('-')
        if not wide:
            for code in code_points:
                if int(code, 16) > 0xFFFF:
                    return 1            # wide-Unicode character
        codes = ' '.join(['U+%s' % code for code in code_points])
        outfile.write('.. %-*s unicode:: %s .. %s\n'
                      % (longest + 2, '|' + entity_name + '|',
                         codes, self.descriptions[charid]))
194 if __name__
== '__main__':