contrib/devtools/update-translations.py

   1 #!/usr/bin/env python
   2 # Copyright (c) 2014 Wladimir J. van der Laan
   3 # Distributed under the MIT software license, see the accompanying
   4 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
   5 '''
   6 Run this script from the root of the repository to update all translations from
   7 transifex.
   8 It will do the following automatically:
   9
  10 - fetch all translations using the tx tool
  11 - post-process them into valid and committable format
  12   - remove invalid control characters
  13   - remove location tags (makes diffs less noisy)
  14
  15 TODO:
  16 - auto-add new translations to the build system according to the translation process
  17 '''
  18 from __future__ import division, print_function
  19 import subprocess
  20 import re
  21 import sys
  22 import os
  23 import io
  24 import xml.etree.ElementTree as ET
  25
# Name of the transifex command-line tool; fetch_all_translations() runs it
# as `TX pull -f -a`
TX = 'tx'
# File name of the source-language file; all_ts_files() never yields it
SOURCE_LANG = 'bitcoin_en.ts'
# Directory with locale files, given relative to the current working
# directory (the script is meant to be run from the repository root)
LOCALE_DIR = 'src/qt/locale'
# Minimum number of messages a file must still contain after postprocessing
# for postprocess_translations() to write it back
MIN_NUM_MESSAGES = 10
  34
  35 def check_at_repository_root():
  36     if not os.path.exists('.git'):
  37         print('No .git directory found')
  38         print('Execute this script at the root of the repository', file=sys.stderr)
  39         sys.exit(1)
  40
  41 def fetch_all_translations():
  42     if subprocess.call([TX, 'pull', '-f', '-a']):
  43         print('Error while fetching translations', file=sys.stderr)
  44         sys.exit(1)
  45
  46 def find_format_specifiers(s):
  47     '''Find all format specifiers in a string.'''
  48     pos = 0
  49     specifiers = []
  50     while True:
  51         percent = s.find('%', pos)
  52         if percent < 0:
  53             break
  54         specifiers.append(s[percent+1])
  55         pos = percent+2
  56     return specifiers
  57
  58 def split_format_specifiers(specifiers):
  59     '''Split format specifiers between numeric (Qt) and others (strprintf)'''
  60     numeric = []
  61     other = []
  62     for s in specifiers:
  63         if s in {'1','2','3','4','5','6','7','8','9'}:
  64             numeric.append(s)
  65         else:
  66             other.append(s)
  67
  68     # If both numeric format specifiers and "others" are used, assume we're dealing
  69     # with a Qt-formatted message. In the case of Qt formatting (see https://doc.qt.io/qt-5/qstring.html#arg)
  70     # only numeric formats are replaced at all. This means "(percentage: %1%)" is valid, without needing
  71     # any kind of escaping that would be necessary for strprintf. Without this, this function
  72     # would wrongly detect '%)' as a printf format specifier.
  73     if numeric:
  74         other = []
  75
  76     # numeric (Qt) can be present in any order, others (strprintf) must be in specified order
  77     return set(numeric),other
  78
  79 def sanitize_string(s):
  80     '''Sanitize string for printing'''
  81     return s.replace('\n',' ')
  82
  83 def check_format_specifiers(source, translation, errors, numerus):
  84     source_f = split_format_specifiers(find_format_specifiers(source))
  85     # assert that no source messages contain both Qt and strprintf format specifiers
  86     # if this fails, go change the source as this is hacky and confusing!
  87     assert(not(source_f[0] and source_f[1]))
  88     try:
  89         translation_f = split_format_specifiers(find_format_specifiers(translation))
  90     except IndexError:
  91         errors.append("Parse error in translation for '%s': '%s'" % (sanitize_string(source), sanitize_string(translation)))
  92         return False
  93     else:
  94         if source_f != translation_f:
  95             if numerus and source_f == (set(), ['n']) and translation_f == (set(), []) and translation.find('%') == -1:
  96                 # Allow numerus translations to omit %n specifier (usually when it only has one possible value)
  97                 return True
  98             errors.append("Mismatch between '%s' and '%s'" % (sanitize_string(source), sanitize_string(translation)))
  99             return False
 100     return True
 101
 102 def all_ts_files(suffix=''):
 103     for filename in os.listdir(LOCALE_DIR):
 104         # process only language files, and do not process source language
 105         if not filename.endswith('.ts'+suffix) or filename == SOURCE_LANG+suffix:
 106             continue
 107         if suffix: # remove provided suffix
 108             filename = filename[0:-len(suffix)]
 109         filepath = os.path.join(LOCALE_DIR, filename)
 110         yield(filename, filepath)
 111
 112 FIX_RE = re.compile(b'[\x00-\x09\x0b\x0c\x0e-\x1f]')
 113 def remove_invalid_characters(s):
 114     '''Remove invalid characters from translation string'''
 115     return FIX_RE.sub(b'', s)
 116
 117 # Override cdata escape function to make our output match Qt's (optional, just for cleaner diffs for
 118 # comparison, disable by default)
 119 _orig_escape_cdata = None
 120 def escape_cdata(text):
 121     text = _orig_escape_cdata(text)
 122     text = text.replace("'", '&apos;')
 123     text = text.replace('"', '&quot;')
 124     return text
 125
 126 def postprocess_translations(reduce_diff_hacks=False):
 127     print('Checking and postprocessing...')
 128
 129     if reduce_diff_hacks:
 130         global _orig_escape_cdata
 131         _orig_escape_cdata = ET._escape_cdata
 132         ET._escape_cdata = escape_cdata
 133
 134     for (filename,filepath) in all_ts_files():
 135         os.rename(filepath, filepath+'.orig')
 136
 137     have_errors = False
 138     for (filename,filepath) in all_ts_files('.orig'):
 139         # pre-fixups to cope with transifex output
 140         parser = ET.XMLParser(encoding='utf-8') # need to override encoding because 'utf8' is not understood only 'utf-8'
 141         with open(filepath + '.orig', 'rb') as f:
 142             data = f.read()
 143         # remove control characters; this must be done over the entire file otherwise the XML parser will fail
 144         data = remove_invalid_characters(data)
 145         tree = ET.parse(io.BytesIO(data), parser=parser)
 146
 147         # iterate over all messages in file
 148         root = tree.getroot()
 149         for context in root.findall('context'):
 150             for message in context.findall('message'):
 151                 numerus = message.get('numerus') == 'yes'
 152                 source = message.find('source').text
 153                 translation_node = message.find('translation')
 154                 # pick all numerusforms
 155                 if numerus:
 156                     translations = [i.text for i in translation_node.findall('numerusform')]
 157                 else:
 158                     translations = [translation_node.text]
 159
 160                 for translation in translations:
 161                     if translation is None:
 162                         continue
 163                     errors = []
 164                     valid = check_format_specifiers(source, translation, errors, numerus)
 165
 166                     for error in errors:
 167                         print('%s: %s' % (filename, error))
 168
 169                     if not valid: # set type to unfinished and clear string if invalid
 170                         translation_node.clear()
 171                         translation_node.set('type', 'unfinished')
 172                         have_errors = True
 173
 174                 # Remove location tags
 175                 for location in message.findall('location'):
 176                     message.remove(location)
 177
 178                 # Remove entire message if it is an unfinished translation
 179                 if translation_node.get('type') == 'unfinished':
 180                     context.remove(message)
 181
 182         # check if document is (virtually) empty, and remove it if so
 183         num_messages = 0
 184         for context in root.findall('context'):
 185             for message in context.findall('message'):
 186                 num_messages += 1
 187         if num_messages < MIN_NUM_MESSAGES:
 188             print('Removing %s, as it contains only %i messages' % (filepath, num_messages))
 189             continue
 190
 191         # write fixed-up tree
 192         # if diff reduction requested, replace some XML to 'sanitize' to qt formatting
 193         if reduce_diff_hacks:
 194             out = io.BytesIO()
 195             tree.write(out, encoding='utf-8')
 196             out = out.getvalue()
 197             out = out.replace(b' />', b'/>')
 198             with open(filepath, 'wb') as f:
 199                 f.write(out)
 200         else:
 201             tree.write(filepath, encoding='utf-8')
 202     return have_errors
 203
 204 if __name__ == '__main__':
 205     check_at_repository_root()
 206     fetch_all_translations()
 207     postprocess_translations()
 208