4 # Author: David Goodger <goodger@python.org>
5 # Copyright: This program has been placed in the public domain.
8 unicode2subfiles.py -- produce character entity files (reStructuredText
9 substitutions) from the W3C master unicode.xml file.
11 This program extracts character entity and entity set information from a
12 unicode.xml file and produces multiple reStructuredText files (in the current
13 directory) containing substitutions. Entity sets are from ISO 8879 & ISO
14 9573-13 (combined), MathML, and HTML4. One or two files are produced for each
15 entity set; a second file with a "-wide.txt" suffix is produced if there are
16 wide-Unicode characters in the set.
18 The input file, unicode.xml, is maintained as part of the MathML 2
19 Recommendation XML source, and is available from
20 <http://www.w3.org/2003/entities/xml/>.
27 from xml
.parsers
.expat
import ParserCreate
usage_msg = """Usage: %s [unicode.xml]"""


def usage(prog, status=0, msg=None):
    """
    Print the usage line (and an optional message) to stderr, then exit.

    `prog` is interpolated into `usage_msg`.  `msg`, when given, is written
    on its own line after the usage line.  The function does not return: it
    raises ``SystemExit`` with `status` as the exit code.
    """
    sys.stderr.write('%s\n' % (usage_msg % prog))
    # Only emit the extra line when a message was supplied; otherwise the
    # literal text "None" would be printed.
    if msg:
        sys.stderr.write('%s\n' % (msg,))
    sys.exit(status)
45 'Too many arguments (%s): only 1 expected.' % (len(argv
) - 1))
47 inpath
= 'unicode.xml'
48 if not os
.path
.isfile(inpath
):
49 usage(argv
[0], 1, 'No such file: "%s".' % inpath
)
54 grouper
= CharacterEntitySetExtractor(infile
)
59 class CharacterEntitySetExtractor
:
62 Extracts character entity information from unicode.xml file, groups it by
63 entity set, and writes out reStructuredText substitution files.
66 unwanted_entity_sets
= ['stix', # unknown, buggy set
70 .. This data file has been placed in the public domain.
71 .. Derived from the Unicode character mappings available from
72 <http://www.w3.org/2003/entities/xml/>.
73 Processed by unicode2rstsubs.py, part of Docutils:
74 <http://docutils.sourceforge.net>.
def __init__(self, infile):
    """
    Prepare an extractor for `infile`.

    Creates the XML parser via `setup_parser` and initialises the (empty)
    bookkeeping attributes that the parser callbacks fill in.
    """
    self.infile = infile
    """Input unicode.xml file."""

    self.parser = self.setup_parser()
    """XML parser."""

    self.elements = []
    """Stack of element names.  Last is current element."""

    self.sets = {}
    """Mapping of charent set name to set dict."""

    self.charid = None
    """Current character's "id" attribute value."""

    self.descriptions = {}
    """Mapping of character ID to description."""
def setup_parser(self):
    """
    Create an expat parser wired to this object's callbacks and return it.

    The start-tag, end-tag and character-data events are routed to
    `StartElementHandler`, `EndElementHandler` and `CharacterDataHandler`
    respectively.
    """
    parser = ParserCreate()
    parser.StartElementHandler = self.StartElementHandler
    parser.EndElementHandler = self.EndElementHandler
    parser.CharacterDataHandler = self.CharacterDataHandler
    # The caller stores the result as ``self.parser``, so it must be returned.
    return parser
104 self
.parser
.ParseFile(self
.infile
)
def StartElementHandler(self, name, attributes):
    """
    Start-tag callback: record the element and dispatch to its handler.

    `name` is pushed onto the ``self.elements`` stack.  If this object has a
    method called ``<name>_start`` it is invoked with `name` and
    `attributes`; elements without such a method are simply recorded.
    """
    self.elements.append(name)
    method_name = name + '_start'
    if not hasattr(self, method_name):
        return
    callback = getattr(self, method_name)
    callback(name, attributes)
def EndElementHandler(self, name):
    """
    End-tag callback: close element `name` and dispatch to its handler.

    Asserts that `name` matches the innermost open element, removes that
    element from the ``self.elements`` stack, and then calls
    ``self.<name>_end(name)`` if such a method exists.

    Raises ``AssertionError`` when the end-tag does not match the element
    on top of the stack.
    """
    # The message reports the whole open-element stack for context.
    assert self.elements[-1] == name, \
           'unknown end-tag %r (%r)' % (name, self.elements)
    # Pop so that the enclosing element becomes current again; without this
    # the check above would fail on the first nested end-tag.
    self.elements.pop()
    handler = name + '_end'
    if hasattr(self, handler):
        getattr(self, handler)(name)
def CharacterDataHandler(self, data):
    """
    Character-data callback: forward text to the current element's handler.

    The current element is the last entry of ``self.elements``.  If this
    object has a method called ``<element>_data`` it receives `data`;
    otherwise the text is ignored.
    """
    current_element = self.elements[-1]
    method_name = current_element + '_data'
    if not hasattr(self, method_name):
        return
    callback = getattr(self, method_name)
    callback(data)
def character_start(self, name, attributes):
    """
    Handler for the start of a ``character`` element.

    Stores the element's ``id`` attribute in ``self.charid`` so that later
    handlers can refer to the character being processed.  `name` is unused.
    """
    current_id = attributes['id']
    self.charid = current_id
def entity_start(self, name, attributes):
    """
    Handler for the start of an ``entity`` element.

    Registers the entity (``attributes['id']``) under its entity set
    (``attributes['set']``, normalised by `entity_set_name`), mapping it to
    the current character ID ``self.charid``.  Entities from unwanted sets
    are ignored.  `name` is unused.

    Raises ``AssertionError`` if the entity is already registered in that
    set for a different character.
    """
    set_name = self.entity_set_name(attributes['set'])
    if not set_name:
        # Unwanted entity set: nothing to record.
        return
    if set_name not in self.sets:
        # Defensive: report a set that `entity_set_name` did not register,
        # instead of failing with a KeyError below.
        print('bad set: %r' % set_name)
        return
    entity = attributes['id']
    members = self.sets[set_name]
    assert (entity not in members
            or members[entity] == self.charid), \
            ('sets[%r][%r] == %r (!= %r)'
             % (set_name, entity, members[entity], self.charid))
    members[entity] = self.charid
def description_data(self, data):
    """
    Handler for text inside a ``description`` element.

    Appends `data` to the description stored for the current character
    (``self.charid``), starting from an empty string on the first call for
    that character.
    """
    text_so_far = self.descriptions.get(self.charid, '')
    self.descriptions[self.charid] = text_so_far + data
146 entity_set_name_pat
= re
.compile(r
'[0-9-]*(.+)$')
147 """Pattern to strip ISO numbers off the beginning of set names."""
def entity_set_name(self, name):
    """
    Return lowcased and standard-number-free entity set name.
    Return ``None`` for unwanted entity sets.

    As a side effect, every wanted set name is registered in ``self.sets``
    (with an empty dict if it was not present yet).
    """
    match = self.entity_set_name_pat.match(name)
    if match is None:
        # The pattern needs at least one character; treat an empty name
        # like an unwanted set instead of failing on ``None.group``.
        return None
    name = match.group(1).lower()
    if name in self.unwanted_entity_sets:
        return None
    self.sets.setdefault(name, {})
    return name
def write_sets(self):
    """
    Write one substitution file (or two) per collected entity set.

    Sets are processed in sorted name order so that the sequence of files
    written does not depend on dictionary ordering.
    """
    for set_name in sorted(self.sets.keys()):
        self.write_set(set_name)
def write_set(self, set_name, wide=None):
    """
    Write the substitution file for entity set `set_name`.

    The file is created in the current directory and named
    ``<set_name>.txt``, or ``<set_name>-wide.txt`` when `wide` is true.  It
    starts with ``self.header`` followed by one `write_entity` call per
    entity, in case-insensitive name order.

    If any `write_entity` call returns a true value (its signal that a
    wide-Unicode character was encountered) and `wide` is false, the set is
    written a second time with ``wide=1``.
    """
    if wide:
        outname = set_name + '-wide.txt'
    else:
        outname = set_name + '.txt'
    entity_map = self.sets[set_name]
    # Sort on (lowercased name, name): case-insensitive order, with the
    # exact spelling as tie-breaker for names that differ only in case.
    entities = sorted([(e.lower(), e) for e in entity_map.keys()])
    # Width of the longest entity name, used to align the output columns.
    longest = 0
    for _, entity_name in entities:
        longest = max(longest, len(entity_name))
    has_wide = None
    outfile = open(outname, 'w')
    try:
        print('writing file "%s"' % outname)
        outfile.write('%s\n' % (self.header,))
        for _, entity_name in entities:
            has_wide = self.write_entity(
                entity_map, set_name, entity_name, outfile, longest,
                wide) or has_wide
    finally:
        # Close (and flush) the file even if an entity fails to write.
        outfile.close()
    if has_wide and not wide:
        self.write_set(set_name, 1)
def write_entity(self, set, set_name, entity_name, outfile, longest,
                 wide=None):
    """
    Write one substitution definition line for `entity_name` to `outfile`.

    `set` maps entity names to character IDs; an ID is a one-letter prefix
    followed by one or more hexadecimal code points joined by ``-``.
    `longest` is the length of the longest entity name in the set and is
    used to align the ``unicode::`` column.  `set_name` is unused.

    When `wide` is false and any code point is above 0xFFFF, nothing is
    written and 1 is returned so the caller knows a wide-Unicode character
    was skipped.  Otherwise the line is written and ``None`` is returned.

    Raises ``KeyError`` if no description was collected for the character.
    """
    charid = set[entity_name]
    # Drop the one-letter prefix and split multi-character sequences.
    code_points = charid[1:].split('-')
    if not wide:
        for code in code_points:
            if int(code, 16) > 0xFFFF:
                return 1            # wide-Unicode character
    codes = ' '.join(['U+%s' % code for code in code_points])
    # "+ 2" accounts for the two "|" delimiters around the entity name.
    outfile.write('.. %-*s unicode:: %s .. %s\n'
                  % (longest + 2, '|' + entity_name + '|',
                     codes, self.descriptions[charid]))
    return None
201 if __name__
== '__main__':