Fix: documentation to literal-block-env and use-verbatim-when-possible
[docutils.git] / tools / dev / create_unimap.py
blob0f733c69fc3497dac83fd20ffffaa0f6a4ed61b0
1 #!/usr/bin/env python
3 # $Id$
4 # Author: Lea Wiemann <LeWiemann@gmail.com>
5 # Copyright: This file has been placed in the public domain.
7 # Call: create_unimap.py < unicode.xml > unicode_latex.py
9 # Get unicode.xml from
10 # <http://www.w3.org/2003/entities/xml/unicode.xml>.
12 from xml.dom import minidom
13 import sys
14 import pprint
def w(s):
    """Write *s* to standard output.

    ``unicode`` objects are encoded as UTF-8 first; anything else is
    handed to ``sys.stdout.write`` unchanged.
    """
    data = s.encode('utf8') if isinstance(s, unicode) else s
    sys.stdout.write(data)
# Accumulators filled in by Visitor.visit_character while the document
# is walked; both are keyed by the Unicode character itself.
text_map = dict()  # character -> '{<latex code>}' (non-math entries)
math_map = dict()  # character -> '$<latex code>$' (math-mode entries)
class Visitor:

    """Collect LaTeX replacements from the nodes of unicode.xml.

    ``call_visitor`` dispatches to ``visit_<nodeName>`` methods, so only
    <character> elements are handled here; every other node is ignored.
    """

    def visit_character(self, node):
        """Record the LaTeX code attached to a <character> element.

        For every <latex> child, the stripped text of that child is
        stored under the character given by the element's ``dec``
        attribute: wrapped in ``$...$`` in ``math_map`` when the
        ``mode`` attribute is ``'math'``, wrapped in ``{...}`` in
        ``text_map`` otherwise.
        """
        for child in node.childNodes:
            if child.nodeName != 'latex':
                continue
            dec = node.attributes['dec'].value
            # Two kinds of entries are skipped:
            # * 'dec' values containing "-".  NOTE(review): presumably
            #   these describe a sequence of several code points --
            #   confirm against unicode.xml; we probably don't need them.
            # * ASCII characters (< 128), whose mappings are wrong
            #   ("-" becomes "$-$", which is too wide) and unnecessary
            #   ("a" becomes "{a}").
            if '-' in dec or int(dec) < 128:
                continue
            latex = child.childNodes[0].nodeValue.encode('ascii').strip()
            if node.attributes['mode'].value == 'math':
                target, template = math_map, '$%s$'
            else:
                target, template = text_map, '{%s}'
            target[unichr(int(dec))] = template % latex
def call_visitor(node, visitor=None):
    """Walk the DOM tree rooted at *node*, dispatching to *visitor*.

    For every node, ``visitor.visit_<name>(node)`` is called before its
    children are processed and ``visitor.depart_<name>(node)`` afterwards,
    whenever the visitor defines such a method.  ``<name>`` is ``'Text'``
    for ``minidom.Text`` nodes and otherwise the node's ``nodeName`` with
    ``#`` replaced by ``_`` (e.g. ``#document`` -> ``_document``).

    Parameters:
        node: the minidom node to start from.
        visitor: object providing the ``visit_*`` / ``depart_*`` methods.
            When omitted (``None``), a fresh ``Visitor`` instance is used.
    """
    if visitor is None:
        # Resolved lazily so that no instance is created (and shared)
        # at function definition time.
        visitor = Visitor()
    if isinstance(node, minidom.Text):
        name = 'Text'
    else:
        name = node.nodeName.replace('#', '_')
    if hasattr(visitor, 'visit_' + name):
        getattr(visitor, 'visit_' + name)(node)
    for child in node.childNodes:
        # Hand the same visitor down; otherwise a caller-supplied visitor
        # would only ever see the root node.
        call_visitor(child, visitor)
    if hasattr(visitor, 'depart_' + name):
        getattr(visitor, 'depart_' + name)(node)
# Parse unicode.xml from standard input and let the visitor fill
# text_map and math_map.
document = minidom.parse(sys.stdin)
call_visitor(document)

# Layer the text entries on top of the math entries (in place, so
# unicode_map and math_map are the same dictionary afterwards).  The
# result holds every text entry, plus the dollar-enclosed math entry for
# each character that has no text entry.
math_map.update(text_map)
unicode_map = math_map
# Header of the generated module, one entry per output line.  The first
# line is assembled with "%" -- presumably so that this script's own
# source does not contain the literal version-control keyword.
header_lines = [
    '# $%s$' % 'Id',
    '# Author: Lea Wiemann <LeWiemann@gmail.com>',
    '# Copyright: This file has been placed in the public domain.',
    '',
    '# This is a mapping of Unicode characters to LaTeX equivalents.',
    '# The information has been extracted from',
    '# <http://www.w3.org/2003/entities/xml/unicode.xml>, written by',
    '# David Carlisle and Sebastian Rahtz.',
    '#',
    '# The extraction has been done by the "create_unimap.py" script',
    '# located at <http://docutils.sf.net/tools/dev/create_unimap.py>.',
    '',
]
for header_line in header_lines:
    sys.stdout.write(header_line + '\n')

# The mapping itself, pretty-printed as a Python dictionary literal.
sys.stdout.write(
    'unicode_map = %s' % pprint.pformat(unicode_map, indent=0) + '\n')