development/tools/convert_kmap.py

   1 #! /usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 # file convert_kmap.py
   5 # This file is part of LyX, the document processor.
   6 # Licence details can be found in the file COPYING.
   7
   8 # author Georg Baum
   9
  10 # Full author contact details are available in file CREDITS
  11
  12 # This script converts a kmap file from LaTeX commands to unicode characters
  13 # The kmap file is read and written in utf8 encoding
  14
  15
  16 import os, re, string, sys, unicodedata
  17
  18 def usage(prog_name):
  19     return ("Usage: %s unicodesymbolsfile inputfile outputfile\n" % prog_name +
  20             "or     %s unicodesymbolsfile <inputfile >outputfile" % prog_name)
  21
  22
  23 def error(message):
  24     sys.stderr.write(message + '\n')
  25     sys.exit(1)
  26
  27
  28 def trim_eol(line):
  29     " Remove end of line char(s)."
  30     if line[-2:-1] == '\r':
  31         return line[:-2]
  32     elif line[-1:] == '\r' or line[-1:] == '\n':
  33         return line[:-1]
  34     else:
  35         # file with no EOL in last line
  36         return line
  37
  38
  39 def read(input):
  40     " Read input file and strip lineendings."
  41     lines = list()
  42     while 1:
  43         line = input.readline()
  44         if not line:
  45             break
  46         lines.append(trim_eol(line).decode('utf8'))
  47     return lines
  48
  49
  50 def escape(word):
  51     " Escape a word for LyXLex."
  52     re_quote = re.compile(r'\s|,')
  53     retval = u''
  54     i = 0
  55     for c in word:
  56         if c == '\\' or c == '"' or c == '#':
  57             retval = retval + u'\\'
  58         retval = retval + c
  59     if re_quote.match(retval):
  60         return u'"%s"' % retval
  61     return retval
  62
  63
  64 def unescape(word):
  65     " Unescape a LyXLex escaped word."
  66     if len(word) > 1 and word[0] == '"' and word[-1] == '"':
  67         start = 1
  68         stop = len(word) - 1
  69     else:
  70         start = 0
  71         stop = len(word)
  72     retval = u''
  73     i = start
  74     while i < stop:
  75         if word[i] == '\\' and i < stop - 1:
  76             i = i + 1
  77         retval = retval + word[i]
  78         i = i + 1
  79     return retval
  80
  81
  82 def readsymbols(input):
  83     " Build the symbol list from the unicodesymbols file and add some hardcoded symbols."
  84     symbols = list()
  85     while 1:
  86         line = input.readline()
  87         if not line:
  88             break
  89         line = trim_eol(line)
  90         tokens = line.split()
  91         if len(tokens) > 0 and tokens[0][0] != '#':
  92             if len(tokens) > 1:
  93                 tokens[1] = unescape(tokens[1])
  94             if tokens[0][0:2] == "0x":
  95                 tokens[0] = int(tokens[0][2:], 16)
  96                 symbols.append(tokens)
  97     # special cases from .cdef files (e.g. duplicates with different commands)
  98     symbols.append([0x00a1, '\\nobreakspace'])
  99     symbols.append([0x00a7, '\\S'])
 100     symbols.append([0x00a9, '\\copyright'])
 101     symbols.append([0x00b1, '$\\pm$'])
 102     symbols.append([0x00b5, '$\\mu$'])
 103     symbols.append([0x00b7, '$\\cdot$'])
 104     symbols.append([0x00b9, '$\\mathonesuperior$'])
 105     symbols.append([0x00d7, '$\\times$'])
 106     symbols.append([0x00d7, '\\times'])
 107     symbols.append([0x00f7, '\\div'])
 108     symbols.append([0x20ac, '\\euro'])
 109     # special caron, see lib/lyx2lyx/lyx_1_5.py for an explanation
 110     symbols.append([0x030c, '\\q', '', 'combining'])
 111     return symbols
 112
 113
 114 def write(output, lines):
 115     " Write output file with native lineendings."
 116     for line in lines:
 117         output.write(line.encode('utf8') + os.linesep)
 118
 119
def translate_symbol(unicodesymbols, symbol, try_combining = True):
    """Translate a symbol from LaTeX to unicode.

    unicodesymbols is a list of entries: entry[0] is the code point,
    entry[1] the LaTeX command, and an optional entry[3] is a flags string
    that may contain 'combining'.
    symbol is the LaTeX command to translate; a symbol consisting of a
    single character is returned unchanged.
    try_combining selects whether a symbol without a direct match is also
    tried as a combining sequence (combining command followed by a base
    character). The recursive call that resolves the base character passes
    False here.

    Returns the translated string, or '' if no translation was found.
    """
    # What follows a combining command must not start with an ASCII letter,
    # otherwise the symbol is a different command sharing the same prefix.
    re_combining = re.compile(r'^[^a-zA-Z]')
    if len(symbol) == 1:
        return symbol
    for i in unicodesymbols:
        # Play safe and don't try combining symbols (not sure if this is
        # needed)
        if i[1] == symbol and (len(i) < 4 or i[3].find('combining') < 0):
            return unichr(i[0])
    if try_combining:
        # no direct match, see whether this is a combining sequence
        for i in unicodesymbols:
            if len(i) > 3 and i[3].find('combining') >= 0 and symbol.find(i[1]) == 0:
                # Test whether this is really a combining sequence, e.g.
                # \"o or \d{o}, and not a symbol like \dh that shares the
                # beginning with a combining symbol
                translated = symbol[len(i[1]):]
                if translated != '' and re_combining.match(translated):
                    # Really a combining sequence
                    if len(translated) > 1 and translated[0] == '{' and translated[-1] == '}':
                        # Strip braces from things like \d{o}
                        translated = translated[1:-1]
                    else:
                        # An unbraced argument such as \'\i does not get
                        # correctly combined below, so first retry with the
                        # braced form (e.g. \'{\i}), which may have its own
                        # entry in unicodesymbols
                        combined = translate_symbol(unicodesymbols, u'%s{%s}' % (i[1], translated))
                        if combined != '':
                            return combined
                    if len(translated) > 1:
                        # The base character may be a symbol itself, e.g \"{\i}
                        translated = translate_symbol(unicodesymbols, translated, False)
                    # Play safe and only translate combining sequences with
                    # one base character. The special caron \q is restricted
                    # to the base characters t, d, l and L.
                    if len(translated) == 1 and (i[1] != '\\q' or translated in ['t', 'd', 'l', 'L']):
                        # Append the combining character to the base
                        # character and normalize the pair.
                        return unicodedata.normalize("NFKC", translated + unichr(i[0]))
                    else:
                        # we found a combining character, but could not
                        # convert the argument to a single character
                        return ''
    return ''
 161
 162
 163 def convert(lines, unicodesymbols):
 164     " Translate all symbols in lines from LaTeX to unicode."
 165     # convert both commented and active entries
 166     re_kmap = re.compile(r'^(#?\s*\\kmap\s+\S+\s+)([^\s]+)(.*)$')
 167     re_kxmod = re.compile(r'^(#?\s*\\kxmod\s+\S+\s+\S+\s+)([^\s]+)(.*)$')
 168     for i in range(len(lines)):
 169         match = re_kmap.match(lines[i])
 170         if not match:
 171             match = re_kxmod.match(lines[i])
 172         if match:
 173             symbol = unescape(match.group(2))
 174             if len(symbol) > 2 and symbol[-2:] == '{}':
 175                 # The unicodesymbols file does not include the trailing delimiter {}
 176                 symbol = symbol[0:-2]
 177             translated = translate_symbol(unicodesymbols, symbol)
 178             if translated == '':
 179                 lines[i] = u'%s%s%s' % (match.group(1), match.group(2), match.group(3))
 180             else:
 181                 lines[i] = u'%s%s%s' % (match.group(1), escape(translated), match.group(3))
 182                 continue
 183
 184
 185 def main(argv):
 186
 187     # Open files
 188     if len(argv) == 2:
 189         input = sys.stdin
 190         output = sys.stdout
 191     elif len(argv) == 4:
 192         input = open(argv[2], 'rb')
 193         output = open(argv[3], 'wb')
 194     else:
 195         error(usage(argv[0]))
 196     unicodesymbols = open(argv[1], 'rb')
 197
 198     # Do the real work
 199     symbols = readsymbols(unicodesymbols)
 200     lines = read(input)
 201     convert(lines, symbols)
 202     write(output, lines)
 203
 204     # Close files
 205     if len(argv) == 3:
 206         input.close()
 207         output.close()
 208
 209     return 0
 210
 211
 212 if __name__ == "__main__":
 213     main(sys.argv)