2 # -*- coding: utf-8 -*-
5 # This file is part of LyX, the document processor.
6 # Licence details can be found in the file COPYING.
10 # Full author contact details are available in file CREDITS
12 # This script converts a kmap file from LaTeX commands to unicode characters
13 # The kmap file is read and written in utf8 encoding
16 import os
, re
, string
, sys
, unicodedata
def usage(prog_name):
    """Return the command line usage message of this script.

    prog_name -- name the script was invoked with; it is inserted into both
                 synopsis lines.
    """
    return ("Usage: %s unicodesymbolsfile inputfile outputfile\n" % prog_name +
            "or %s unicodesymbolsfile <inputfile >outputfile" % prog_name)
def error(message):
    """Write message, followed by a newline, to stderr and exit.

    message -- text to report.
    """
    sys.stderr.write(message + '\n')
    # NOTE(review): only the stderr write is visible in the source this was
    # rebuilt from. The exit is assumed because the caller reports a usage
    # error through error() and could not sensibly continue afterwards --
    # confirm the exit status.
    sys.exit(1)
def trim_eol(line):
    """Remove end of line char(s).

    line -- one line of text, either a text string or a byte string.

    Returns line without its trailing "\\r\\n", "\\n" or "\\r".  A line that
    does not end in an end of line character (e.g. the last line of a file
    with no EOL) is returned unchanged.
    """
    if isinstance(line, bytes):
        cr, lf = b'\r', b'\n'
    else:
        cr, lf = u'\r', u'\n'
    last = line[-1:]
    if last != cr and last != lf:
        # file with no EOL in last line.  This has to be checked first:
        # looking only at the second to last character would also strip
        # "\rX" from a line whose last character is ordinary text.
        return line
    if line[-2:-1] == cr:
        return line[:-2]
    return line[:-1]
40 " Read input file and strip lineendings."
43 line
= input.readline()
46 lines
.append(trim_eol(line
).decode('utf8'))
def escape(word):
    """Escape a word for LyXLex.

    word -- the unescaped text.

    Backslash, double quote and hash characters are prefixed with a
    backslash.  If the result contains whitespace or a comma anywhere, it is
    additionally enclosed in double quotes.
    """
    re_quote = re.compile(r'\s|,')
    retval = u''.join(
        u'\\' + c if c in ('\\', '"', '#') else c for c in word)
    # search(), not match(): match() only looks at the start of the string,
    # so a word with a blank or comma after its first character was left
    # unquoted.
    if re_quote.search(retval):
        return u'"%s"' % retval
    return retval
def unescape(word):
    """Unescape a LyXLex escaped word.

    word -- the escaped text, optionally enclosed in double quotes.

    Enclosing double quotes are dropped and every backslash that is followed
    by another character (inside the quotes, if any) is removed, keeping the
    character after it.  A trailing lone backslash is kept.
    """
    if len(word) > 1 and word[0] == '"' and word[-1] == '"':
        start, stop = 1, len(word) - 1
    else:
        start, stop = 0, len(word)
    chars = []
    i = start
    while i < stop:
        if word[i] == '\\' and i < stop - 1:
            # Skip the escape character and take the next one literally
            i += 1
        chars.append(word[i])
        i += 1
    return u''.join(chars)
def readsymbols(input):
    """Build the symbol list from the unicodesymbols file and add some hardcoded symbols.

    input -- object with a readline() method returning the lines of the
             unicodesymbols file, as bytes (decoded as utf8 here) or text.

    Returns a list of entries.  In each entry item 0 is the code point as
    integer, item 1 the unescaped LaTeX command, and any further
    whitespace separated fields of the line follow unchanged.
    """
    symbols = []
    while True:
        line = input.readline()
        if not line:
            break
        if isinstance(line, bytes):
            # Work on text throughout, so that the commands compare equal
            # to the (decoded) kmap contents on Python 2 and 3 alike
            line = line.decode('utf8')
        tokens = trim_eol(line).split()
        if not tokens or tokens[0][0] == '#':
            # blank line or comment
            continue
        if len(tokens) < 2 or tokens[0][0:2] != "0x":
            # An entry is only usable with both a hexadecimal code point
            # and a command
            continue
        tokens[0] = int(tokens[0][2:], 16)
        tokens[1] = unescape(tokens[1])
        symbols.append(tokens)
    # special cases from .cdef files (e.g. duplicates with different commands)
    symbols.extend([
        # U+00A0 is NO-BREAK SPACE (U+00A1, used here before, is the
        # inverted exclamation mark)
        [0x00a0, '\\nobreakspace'],
        [0x00a7, '\\S'],
        [0x00a9, '\\copyright'],
        [0x00b1, '$\\pm$'],
        [0x00b5, '$\\mu$'],
        [0x00b7, '$\\cdot$'],
        [0x00b9, '$\\mathonesuperior$'],
        [0x00d7, '$\\times$'],
        [0x00d7, '\\times'],
        [0x00f7, '\\div'],
        [0x20ac, '\\euro'],
        # special caron, see lib/lyx2lyx/lyx_1_5.py for an explanation
        [0x030c, '\\q', '', 'combining'],
    ])
    return symbols
def write(output, lines):
    """Write output file with native lineendings.

    output -- object with a write() method that accepts byte strings.
    lines  -- iterable of text strings without line ending.

    Every line is written utf8 encoded, followed by os.linesep.
    """
    # Encode the line ending as well: concatenating the encoded line with
    # the plain os.linesep string raises a TypeError on Python 3.
    eol = os.linesep.encode('ascii')
    for line in lines:
        output.write(line.encode('utf8') + eol)
def translate_symbol(unicodesymbols, symbol, try_combining=True):
    """Translate a symbol from LaTeX to unicode.

    unicodesymbols -- sequence of entries: item 0 is the integer code point,
                      item 1 the LaTeX command.  An entry whose item 3
                      contains 'combining' describes a combining character.
    symbol         -- the LaTeX command(s) to translate.
    try_combining  -- if false, only direct matches are considered.

    Returns the translation as unicode string, or '' if there is none.
    """
    try:
        to_char = unichr        # Python 2
    except NameError:
        to_char = chr           # Python 3
    # The argument of an accent command that ends in a letter (\d) must not
    # start with a letter, otherwise it is a different command (\dh) ...
    re_combining = re.compile(r'^[^a-zA-Z]')
    # ... but a command that ends in a non-letter (\") may be followed
    # directly by a letter (\"o).
    re_control_symbol = re.compile(r'[^a-zA-Z]$')

    for entry in unicodesymbols:
        # Play safe and don't try combining symbols (not sure if this is
        # needed)
        if entry[1] == symbol and (len(entry) < 4 or entry[3].find('combining') < 0):
            return to_char(entry[0])
    if not try_combining:
        return ''

    # no direct match, see whether this is a combining sequence
    for entry in unicodesymbols:
        if not (len(entry) > 3 and entry[3].find('combining') >= 0 and symbol.find(entry[1]) == 0):
            continue
        # Test whether this is really a combining sequence, e.g.
        # \"o or \d{o}, and not a symbol like \dh that shares the
        # beginning with a combining symbol
        translated = symbol[len(entry[1]):]
        if translated == '':
            continue
        if not (re_combining.match(translated) or re_control_symbol.search(entry[1])):
            continue
        # Really a combining sequence
        if len(translated) > 1 and translated[0] == '{' and translated[-1] == '}':
            # Strip braces from things like \d{o}
            translated = translated[1:-1]
        else:
            # for some strange reason \'\i does not get
            # correctly combined, so we try \'{\i} which has an
            # entry in unicodesymbols
            combined = translate_symbol(unicodesymbols, u'%s{%s}' % (entry[1], translated))
            if combined != '':
                return combined
        if len(translated) > 1:
            # The base character may be a symbol itself, e.g \"{\i}
            translated = translate_symbol(unicodesymbols, translated, False)
        # Play safe and only translate combining sequences with
        # one base character
        if len(translated) == 1 and (entry[1] != '\\q' or translated in ['t', 'd', 'l', 'L']):
            return unicodedata.normalize("NFKC", translated + to_char(entry[0]))
        # we found a combining character, but could not convert the
        # argument to a single character
        return ''
    return ''
def convert(lines, unicodesymbols):
    """Translate all symbols in lines from LaTeX to unicode.

    lines          -- list of text lines of a kmap file; modified in place.
    unicodesymbols -- symbol list that is handed to translate_symbol().

    Only \\kmap and \\kxmod lines are touched.  If the symbol of such a line
    cannot be translated, the symbol is kept as it is.
    """
    # convert both commented and active entries
    re_kmap = re.compile(r'^(#?\s*\\kmap\s+\S+\s+)([^\s]+)(.*)$')
    re_kxmod = re.compile(r'^(#?\s*\\kxmod\s+\S+\s+\S+\s+)([^\s]+)(.*)$')
    for index, line in enumerate(lines):
        match = re_kmap.match(line) or re_kxmod.match(line)
        if not match:
            continue
        symbol = unescape(match.group(2))
        if len(symbol) > 2 and symbol[-2:] == '{}':
            # The unicodesymbols file does not include the trailing delimiter {}
            symbol = symbol[0:-2]
        translated = translate_symbol(unicodesymbols, symbol)
        if translated == '':
            # No translation: keep the symbol exactly as it was written
            replacement = match.group(2)
        else:
            replacement = escape(translated)
        lines[index] = u'%s%s%s' % (match.group(1), replacement, match.group(3))
192 input = open(argv
[2], 'rb')
193 output
= open(argv
[3], 'wb')
195 error(usage(argv
[0]))
196 unicodesymbols
= open(argv
[1], 'rb')
199 symbols
= readsymbols(unicodesymbols
)
201 convert(lines
, symbols
)
212 if __name__
== "__main__":