4 # Author: David Goodger <goodger@python.org>
5 # Copyright: This program has been placed in the public domain.
8 unicode2subfiles.py -- produce character entity files (reStructuredText
9 substitutions) from the W3C master unicode.xml file.
11 This program extracts character entity and entity set information from a
12 unicode.xml file and produces multiple reStructuredText files (in the current
13 directory) containing substitutions. Entity sets are from ISO 8879 & ISO
14 9573-13 (combined), MathML, and HTML4. One or two files are produced for each
15 entity set; a second file with a "-wide.txt" suffix is produced if there are
16 wide-Unicode characters in the set.
18 The input file, unicode.xml, is maintained as part of the MathML 2
19 Recommendation XML source, and is available from
20 <http://www.w3.org/2003/entities/xml/>.
27 from xml
.parsers
.expat
import ParserCreate
usage_msg = """Usage: %s [unicode.xml]\n"""


def usage(prog, status=0, msg=None):
    """Print the usage line (and an optional message) to stderr, then exit.

    Parameters:

    - `prog`: program name interpolated into `usage_msg`.
    - `status`: process exit status passed to ``sys.exit()``.
    - `msg`: optional extra message; written on its own line after the
      usage text.  Skipped when ``None`` or empty, so the default call
      ``usage(prog)`` does not try to concatenate ``None`` with a string.

    Raises ``SystemExit`` (via ``sys.exit(status)``); never returns.
    """
    sys.stderr.write(usage_msg % prog)
    if msg:
        sys.stderr.write(msg + '\n')
    sys.exit(status)
45 'Too many arguments (%s): only 1 expected.' % (len(argv
) - 1))
47 inpath
= 'unicode.xml'
48 if not os
.path
.isfile(inpath
):
49 usage(argv
[0], 1, 'No such file: "%s".' % inpath
)
50 if sys
.version_info
>= (3,0):
51 infile
= open(inpath
, mode
='rb')
57 grouper
= CharacterEntitySetExtractor(infile
)
class CharacterEntitySetExtractor:

    """
    Extracts character entity information from unicode.xml file, groups it by
    entity set, and writes out reStructuredText substitution files.

    Usage: instantiate with an open unicode.xml file object, call `group()`
    to parse it, then `write_sets()` to write one ``<set>.txt`` file (plus a
    ``<set>-wide.txt`` file when the set contains characters above U+FFFF)
    per entity set into the current directory.

    XML events are dispatched by name: for an element ``foo`` the methods
    ``foo_start(name, attributes)``, ``foo_end(name)`` and ``foo_data(data)``
    are called if they exist.
    """

    # Lowercased, number-stripped set names that are skipped entirely.
    unwanted_entity_sets = ['stix',     # unknown, buggy set
                            'predefined']

    # Text written at the top of every generated file.
    header = """\
.. This data file has been placed in the public domain.
.. Derived from the Unicode character mappings available from
   <http://www.w3.org/2003/entities/xml/>.
   Processed by unicode2rstsubs.py, part of Docutils:
   <http://docutils.sourceforge.net>.
"""

    def __init__(self, infile):
        self.infile = infile
        """Input unicode.xml file."""

        self.parser = self.setup_parser()
        """XML parser."""

        self.elements = []
        """Stack of element names.  Last is current element."""

        self.sets = {}
        """Mapping of charent set name to set dict."""

        self.charid = None
        """Current character's "id" attribute value."""

        self.descriptions = {}
        """Mapping of character ID to description."""

    def setup_parser(self):
        """Return an expat parser whose callbacks are this object's handlers."""
        parser = ParserCreate()
        parser.StartElementHandler = self.StartElementHandler
        parser.EndElementHandler = self.EndElementHandler
        parser.CharacterDataHandler = self.CharacterDataHandler
        return parser

    def group(self):
        """Parse the input file, filling `self.sets` and `self.descriptions`."""
        self.parser.ParseFile(self.infile)

    def StartElementHandler(self, name, attributes):
        """Push `name` on the element stack; call ``<name>_start`` if defined."""
        self.elements.append(name)
        handler = getattr(self, name + '_start', None)
        if handler is not None:
            handler(name, attributes)

    def EndElementHandler(self, name):
        """Pop `name` off the element stack; call ``<name>_end`` if defined."""
        # The message must reference `self.elements` (the stack); a misspelt
        # attribute here would turn the AssertionError into AttributeError.
        assert self.elements[-1] == name, \
               'unknown end-tag %r (%r)' % (name, self.elements)
        self.elements.pop()
        handler = getattr(self, name + '_end', None)
        if handler is not None:
            handler(name)

    def CharacterDataHandler(self, data):
        """Pass text to ``<current element>_data`` if such a method exists."""
        handler = getattr(self, self.elements[-1] + '_data', None)
        if handler is not None:
            handler(data)

    def character_start(self, name, attributes):
        """Remember the "id" of the <character> element being processed."""
        self.charid = attributes['id']

    def entity_start(self, name, attributes):
        """Record an <entity> of the current character in its entity set.

        Entities from unwanted sets are ignored.  An entity name that is
        already mapped to a *different* character ID in the same set trips
        an assertion.
        """
        set_name = self.entity_set_name(attributes['set'])
        if not set_name:
            return                      # unwanted entity set
        if set_name not in self.sets:
            print('bad set: %r' % set_name)
            return
        entity = attributes['id']
        entities = self.sets[set_name]
        assert (entity not in entities
                or entities[entity] == self.charid), \
               ('sets[%r][%r] == %r (!= %r)'
                % (set_name, entity, entities[entity], self.charid))
        entities[entity] = self.charid

    def description_data(self, data):
        """Append `data` to the current character's description.

        The parser may deliver one text node in several pieces, so the
        pieces are concatenated.
        """
        self.descriptions[self.charid] = (
            self.descriptions.get(self.charid, '') + data)

    entity_set_name_pat = re.compile(r'[0-9-]*(.+)$')
    """Pattern to strip ISO numbers off the beginning of set names."""

    def entity_set_name(self, name):
        """
        Return lowcased and standard-number-free entity set name.
        Return ``None`` for unwanted entity sets.

        As a side effect, an empty dict is registered in `self.sets` for a
        wanted set name that has not been seen before.
        """
        match = self.entity_set_name_pat.match(name)
        name = match.group(1).lower()
        if name in self.unwanted_entity_sets:
            return None
        self.sets.setdefault(name, {})
        return name

    def write_sets(self):
        """Write the output file(s) for every collected set, in name order."""
        for set_name in sorted(self.sets):
            self.write_set(set_name)

    def write_set(self, set_name, wide=None):
        """Write the substitution file for the entity set `set_name`.

        With a false `wide`, the file is ``<set_name>.txt`` and entities
        containing a code point above U+FFFF are left out; if any were left
        out, the method calls itself again with `wide` true.  With a true
        `wide`, the file is ``<set_name>-wide.txt`` and every entity of the
        set is written.
        """
        if wide:
            outname = set_name + '-wide.txt'
        else:
            outname = set_name + '.txt'
        entity_set = self.sets[set_name]
        # Case-insensitive order, with the exact name as tie-breaker.
        entity_names = sorted(entity_set, key=lambda e: (e.lower(), e))
        # `or [0]` keeps max() happy for an empty set.
        longest = max([len(e) for e in entity_names] or [0])
        has_wide = False
        # The context manager guarantees the file is flushed and closed,
        # also when writing an entity raises.
        with open(outname, 'w') as outfile:
            print('writing file "%s"' % outname)
            outfile.write(self.header + '\n')
            for entity_name in entity_names:
                if self.write_entity(entity_set, set_name, entity_name,
                                     outfile, longest, wide):
                    has_wide = True
        if has_wide and not wide:
            self.write_set(set_name, 1)

    def write_entity(self, set, set_name, entity_name, outfile, longest,
                     wide=None):
        """Write one substitution definition line to `outfile`.

        Parameters:

        - `set`: mapping of entity name to character ID (parameter name kept
          for backwards compatibility although it shadows the builtin).
        - `set_name`: name of the entity set (not used here).
        - `entity_name`: key into `set`.
        - `outfile`: writable text file object.
        - `longest`: length of the longest entity name, for column alignment.
        - `wide`: if false, entities with a code point above U+FFFF are
          skipped instead of written.

        Return true if the entity was skipped because it is wide, false if
        it was written.
        """
        charid = set[entity_name]
        # Character IDs look like "U0003D" or "U0003D-020E5": drop the
        # leading letter and split multi-character sequences.
        codes = charid[1:].split('-')
        if not wide and any(int(code, 16) > 0xFFFF for code in codes):
            return True                 # wide-Unicode character
        outfile.write('.. %-*s unicode:: %s .. %s\n'
                      % (longest + 2, '|' + entity_name + '|',
                         ' '.join(['U+%s' % code for code in codes]),
                         self.descriptions[charid]))
        return False
204 if __name__
== '__main__':