fix proxy support
[straw.git] / tools / msgmerge.py
blob11fec60a7eb40f588e0857181f9ccc3079ab8ce8
1 #! /usr/bin/env python
2 # -*- coding: iso-8859-1 -*-
4 # Copyright Terje Røsten <terjeros@phys.ntnu.no> Nov. 2003.
5 #
6 '''Merge two Uniforum style .po files together.
8 This is an implementation (not complete) in Python of the GNU
9 msgmerge(1) program. It can be used on the command line (or as a Python
10 module).
12 Usage: msgmerge.py [OPTIONS] def.po ref.pot
14 The def.po file is an existing PO file with translations. The ref.pot
15 file is the last created PO file with up-to-date source references but
16 old translations, or a PO Template file.
18 Options:
19 -U, --update update def.po,
20 do nothing if def.po is already up to date.
21 -o, --output-file=FILE write output to file FILE. Output is written to
22 stdout if set to - or if the option is not present.
23 -D, --docstrings don\'t remove docstring flag.
24 -h, --help display help text and exit.
25 -V, --version display version and exit.
26 -q, --quiet, --silent suppress progress indicators.
27 '''
28 from __future__ import generators
if not __name__ == '__main__':
    # When imported as a module, extend the module docstring with notes
    # about the public functions.  __doc__ is None when Python runs with
    # -OO (docstrings stripped), so fall back to an empty string instead
    # of failing on "None += str" at import time.
    __doc__ = (__doc__ or '') + '''\

When used as module the interesting functions are merge() and
merge_dir().

The merge() function does the same as the command line version, and
the arguments are as follows. The first argument is the def.po file,
then the ref.pot file. The third argument controls whether to work in
update mode or not, then the next argument sets the output file. Set
the next argument to False to remove docstring flags. The last
argument can be used to suppress progress indicators. The default is
to work in update mode with progress indicators.

Example:
    merge("def.po", "ref.pot")
        merge the files def.po and ref.pot and write output to def.po if
        there are any changes.
    merge("def.po", "ref.pot", docstrings = False, verbose = False,
          update = False, outfile = "-")
        merge the files def.po and ref.pot and write output to stdout,
        remove docstring flag and be quiet.

The merge_dir() function is useful when merging a directory of po
files. The only required argument is the name of the directory with po
files and the pot file. It will use simple glob to find the files. The
second argument can be used to specify the pot file (in the
directory). Third argument is a list of po files (then globbing will
not be used) and the next argument is a list of filenames to exclude.
The last argument can be used to suppress progress indicators.
Docstring flag will not be removed.

Example:
    merge_dir("po")
        merge (and update) all po files in directory po with the single pot
        file in the same directory.

The module raises the MsgmergeError exception in case of error.
'''
# Version string printed by the -V/--version option.
__version__ = '0.1'
# Program name interpolated into the version line and error messages.
name = 'msgmerge.py'

# Names exported by "from <module> import *".
__all__ = [ 'merge', 'merge_dir', 'MsgmergeError' ]
74 import sys
75 import re
76 import string
77 import getopt
78 import difflib
79 import glob
80 import os.path
# True and False are builtins on every interpreter that can still run
# this module (Python 2.3 and later).  The compatibility shim that used
# to live here assigned to them ("True, False = 1, 0"), which is a
# SyntaxError on Python 3 and stopped the whole file from compiling,
# so no fallback definition is provided any more.
class Msgs:
    '''
    One entry of a po/pot file.

    Holds the msgid and msgstr text, the comment, reference and flag
    lines, the line number where the entry was read and a usage counter.
    The class attributes <width> (wrap width) and <file> (file name used
    in parse error messages) are shared defaults for all instances.
    '''
    width = 80
    file = ''

    def __init__(self, msgid, msgstr, flag, lno, entry, **kwds):
        # Mandatory fields
        self.id, self.str = msgid, msgstr
        self.flag, self.entry, self.lno = flag, entry, lno
        # Optional fields, empty when not supplied
        self.cmt = kwds.get('cmt', '')
        self.ref = kwds.get('ref', '')
        self.autocmt = kwds.get('autocmt', '')
        # Incremented by used()
        self.count = 0

    def wash(self):
        '''Re-wrap the msgid and msgstr fields with the module wash().'''
        where = { 'width' : self.width,
                  'filename' : self.file,
                  'lno' : self.lno }
        self.id = wash(self.id, **where)
        self.str = wash(self.str, 'msgstr', **where)

    def used(self):
        '''Record one more use of this entry.'''
        self.count = self.count + 1

    def get_clean_id(self):
        '''Return the msgid text without its first msgid " prefix.'''
        return self.id.replace('msgid "', '', 1)

    def obsolete(self):
        '''Re-wrap the entry and prefix every line with the #~ marker.'''
        marker = '#~ '
        # Leave room for the marker.  Assigning through self gives this
        # instance its own width; the class default is not touched.
        self.width = self.width - len(marker)
        self.wash()
        self.id = ''.join([ '%s%s\n' % (marker, row)
                            for row in self.id.splitlines() ])
        self.str = ''.join([ '%s%s\n' % (marker, row)
                             for row in self.str.splitlines() ])
class Options:
    '''
    Container for the run time options.

    With <cmdline> true the command line defaults are used and keyword
    arguments are ignored; otherwise update, outfile, docstrings and
    verbose are taken from the keyword arguments.
    '''
    def __init__(self, cmdline = False, **kwds):
        if cmdline:
            # Starting point before the command line switches are applied
            self.update = self.outfile = self.docstrings = False
            self.verbose = True
            return
        # Module API defaults
        self.update = kwds.get('update', True)
        self.outfile = kwds.get('outfile', '-')
        self.docstrings = kwds.get('docstrings', True)
        self.verbose = kwds.get('verbose', False)
class MsgmergeError(Exception):
    '''
    Exception class for msgmerge.

    <args> is the error message.  It is handed to Exception.__init__ so
    that str(error) gives the message back.  Assigning the string to
    self.args directly makes Python 2.5+ and 3 turn it into a tuple of
    single characters, which garbles the printed message.
    '''
    def __init__(self, args):
        Exception.__init__(self, args)
def gen(lines):
    '''
    Generator over the list <lines>.

    Yields (line, line number) pairs, numbered from 1, with the first
    obsolete prefix "#~ " removed from each line.  After the last line
    one extra pair is yielded: the last line once more (unmodified) with
    the same line number, so a consumer that reads one item ahead still
    gets a value at the end of the file.  For an empty list the extra
    pair is ('', 0).
    '''
    lno = 0
    # Pre-set so the trailing yield also works when <lines> is empty.
    l = ''
    for l in lines:
        lno += 1
        yield l.replace('#~ ', '', 1), lno
    yield l, lno
def slurp(s, g, sign):
    '''
    Append following lines to <s> as long as they start with <sign>.

    <g> is an iterator of (line, line number) pairs.  Lines are read
    from it and added to the string <s> while they begin with the string
    <sign>; when <sign> is '# ' a line holding only '#' is accepted as
    well.  The return value is the first line which does not qualify,
    its line number, the iterator <g> and the (possibly) extended
    string <s>.

    StopIteration from the iterator is not caught here.
    '''
    # next(g) works on Python 2.6+ and 3, g.next() only on Python 2.
    l, lno = next(g)
    while l.startswith(sign) or (sign == '# ' and l.strip() == '#'):
        s += l
        l, lno = next(g)
    return l, lno, g, s
def splitted_fit(chunk, line, width, break_always, break_after_space):
    '''
    Find a place to split the string <chunk> so that its first part can
    be appended to the string <line>.

    <chunk> is scanned from its end towards its start.  A split is made
    in front of the first character found which is either
      - one of <break_always>, when the part before it together with
        <line> is not longer than <width>,
      - one of <break_after_space> that follows a whitespace character
        (the width is not checked in this case), or
      - a backslash followed by ", under the same width condition.
    The return value is the tuple (part that fits, rest of chunk); when
    no split point is found it is ('', chunk).
    '''
    for pos in range(len(chunk) - 1, -1, -1):
        char = chunk[pos]
        head, tail = chunk[:pos], chunk[pos:]
        fits = len(head + line) <= width
        if char in break_always and fits:
            return head, tail
        if char in break_after_space and pos and chunk[pos - 1].strip() == '':
            return head, tail
        if char == '\\' and len(tail) > 1 and chunk[pos + 1] == '"' and fits:
            return head, tail
    return '', chunk
def wrap(msg, width):
    '''
    Wrap the string <msg> to the width <width>.

    <msg> is first cut into chunks with a regular expression: each chunk
    is a run of word and "extra" punctuation characters followed by
    whitespace and/or "ender" punctuation.  The chunks are then glued
    together into lines; when the next chunk would make a line longer
    than <width>, splitted_fit() is asked for the part of the chunk that
    may still go on the current line.  Every resulting line is
    surrounded with a pair of " and the lines are joined with newlines.
    An empty or whitespace-only <msg> is returned quoted as it is.
    '''
    if msg.isspace() or not msg:
        return '"%s"' % msg

    # \ and " is here, but " is special in po files.
    break_always = '$%+({['
    # XXX what about non-ASCII punctuation (guillemets, copyright sign,
    # broken bar, section sign etc.)?
    break_after_space = '_-=^`~\'<|>&*#@'
    enders = '.:,;!?/])}|%-'
    # extra = all ASCII punctuation which is not an ender
    extra = string.punctuation
    for c in enders:
        extra = extra.replace(c, '')
    escaped = { 'enders' : re.escape(enders),
                'extra' : re.escape(extra) }
    regex = r'([\w%(extra)s]*[\s%(enders)s)]+[\s%(enders)s]*)' % escaped
    r = re.compile(regex, re.UNICODE)
    # The pattern is one capturing group, so split() keeps the matched
    # chunks as well as the text between them; empty pieces are dropped.
    msg = [ m for m in r.split(msg) if not m == '']

    lines = []
    line = msg.pop(0)

    # Handle \n on end of line
    # (the split can separate the backslash from its n; glue them again)
    if len(msg) > 1 and msg[-1] == 'n' and len(msg[-2]) > 0 \
           and msg[-2][-1] == '\\':
        msg[-2] += msg[-1]
        msg.pop()
    # Do not allow a single \n on a line
    if len(msg) > 2 and msg[-1] == '\\n':
        msg[-2] += msg[-1]
        msg.pop()

    for m in msg:
        if len(line) > width or len(m) > width or len(line + m) > width:
            # Too long: finish the current line with whatever part of
            # the chunk fits and start the next line with the rest.
            fit, rest = splitted_fit(m, line, width, break_always,
                                     break_after_space)
            line += fit
            lines.append(line)
            line = rest
        else:
            line += m
    # Flush the line still being built
    lines.append(line)
    lines = [ '"%s"' % l for l in lines ]
    return '\n'.join(lines)
def normalize(lines):
    r'''
    Put every escaped newline in the string <lines> at the end of its
    own quoted line, e.g "\n\nText\n\n" becomes:
    "\n"
    "\n"
    "Text\n"
    "\n"

    Nothing is changed unless the first \n is found after the start of
    the string and more than three characters before its end.  A closing
    \n" at the very end of the string is left as it is.
    '''
    first = lines.find('\\n')
    if not 0 < first < len(lines) - 3:
        return lines

    closing = '\\n"'
    if lines[-3:] == closing:
        # Keep the final \n" out of the replacement
        body = lines[:-3]
    else:
        body, closing = lines, ''
    # Close and reopen the quotes after each \n ...
    body = body.replace('\\n', '\\n"\n"')
    # ... and drop the empty "" lines this may have produced
    return body.replace('""\n', '') + closing
def wash(msg, idx = 'msgid', width = 80, **kwds):
    '''
    Do washing on the msgstr or msgid fields. Wrap the text to fit in
    width <width>.

    <msg> is the field as one string (keyword line plus any quoted
    continuation lines).  <idx> tells whether it is a msgid or a msgstr
    field.  <filename> and <lno> (line number of the end of the field)
    are picked up from <kwds>; they are only used to build the message
    of a parse error.

    Returns the washed field as a string.  A field of a single line
    which is not longer than <width> is returned as normalize() left it.
    Raises MsgmergeError if a line is not of the form keyword "text" /
    "text"; the offending line is written to standard output first.
    '''
    msg = normalize(msg)
    lines = msg.splitlines()
    size = len(lines)
    if not (size > 1 or len(msg) > width):
        return msg

    washed = []
    # The first line is special: it carries the keyword
    m = re.match('^%s "(.*)"$' % (idx, ), lines[0])
    if not m:
        sys.stdout.write('%s\n' % lines[0])
        # <lno> points past the field, step back to the bad line
        kwds['lno'] -= size + 1
        raise MsgmergeError('parse error: %(filename)s:%(lno)s.'
                            % kwds)
    washed.append(m.group(1))
    if m.group(1).endswith(r'\n'):
        # An escaped newline ends the logical line, start a new one
        washed.append('')
    i = 0
    for line in lines[1:]:
        m = re.match(r'^"(\s*.*)"$', line)
        i += 1
        if not m:
            sys.stdout.write('%s\n' % line)
            kwds['lno'] -= size - i + 1
            raise MsgmergeError('parse error: %(filename)s:%(lno)s.'
                                % kwds)
        washed[-1] += m.group(1)
        if m.group(1).endswith(r'\n'):
            washed.append('')
    if washed[0] == '':
        washed.pop(0)
    # The list is empty here when every quoted string of the field was
    # empty (e.g. msgstr "" followed by ""); do not index into it then.
    if washed and washed[-1] == '':
        washed.pop()
    if not washed:
        return '%s ""\n' % idx

    washed = [ wrap(w, width - 3) for w in washed ] # " and \n removed.

    # One line or multiline
    single = '%s %s\n' % (idx, washed[0])
    if len(washed) == 1 and len(single) < width:
        return single
    return '%s ""\n%s\n' % (idx, '\n'.join(washed))
def parse(filename, entry):
    '''
    Parse po or pot file with name <filename>. Set the variable
    <entry> to msgid/msgstr to indicate pot/po file. The return value
    is a dict with msgid (washed) as key and Msgs instances as
    values.

    For a po file (<entry> is 'msgstr') the translator comments are
    kept with each entry, for a pot file (<entry> is 'msgid') the
    automatic comments and the source references.  Msgs.file is set to
    <filename> as a side effect.  Raises MsgmergeError when the file
    cannot be opened or an unexpected line is met.
    '''
    fd = io(filename)
    try:
        lines = fd.readlines()
    finally:
        # Do not keep the file open for the rest of the run
        fd.close()
    Msgs.file = filename
    messages = {}
    last = len(lines)
    g = gen(lines)
    # msgstr starts out empty too, so an entry without a msgstr line
    # does not hit an unbound name below.
    cmt = autocmt = ref = flag = msgstr = ''
    msgid = False
    lno = 0
    while not lno == last:
        # next(g) works on Python 2.6+ and 3, g.next() only on Python 2
        l, lno = next(g)
        # Each slurp() hands back the first line it did not consume, so
        # the parts of an entry are picked up one after the other.
        if l.startswith('# '):
            l, lno, g, cmt = slurp(l, g, '# ')
        if l.startswith('#.'):
            l, lno, g, autocmt = slurp(l, g, '#.')
        if l.startswith('#:'):
            l, lno, g, ref = slurp(l, g, '#:')
        if l.startswith('#,'):
            l, lno, g, flag = slurp(l, g, '#,')
        if l.startswith('msgid'):
            l, lno, g, msgid = slurp(l, g, '"')
        if l.startswith('msgstr'):
            l, lno, g, msgstr = slurp(l, g, '"')

        # Only a blank line (or the end of the file) may follow
        if not lno == last and not l.strip() == '':
            raise MsgmergeError('parse error: %s:%s.' % (filename, lno))

        if msgid and entry in ('msgstr', 'msgid'):
            idx = wash(msgid, filename = filename, lno = lno)
            if entry == 'msgstr':
                messages[idx] = Msgs(msgid, msgstr, flag, lno, entry,
                                     cmt = cmt)
            else:
                messages[idx] = Msgs(msgid, msgstr, flag, lno, entry,
                                     autocmt = autocmt, ref = ref)
            # Start collecting the next entry
            msgid = False
            msgstr = cmt = autocmt = ref = flag = ''

    for m in messages.values():
        m.wash()
    return messages
def fuzzy_match(pot, defs):
    '''
    Look for the Msgs object in the dict <defs> whose id is most similar
    (difflib ratio) to the id of the Msgs object <pot>.

    Entries without translation and the header entry are not
    considered.  A candidate must score better than 0.59 (the limit 0.6
    less 0.01) and better than the best candidate so far.  The return
    value is the best Msgs object from <defs>, or False if there is no
    suitable one.
    '''
    limit = 0.6
    best_ratio, best = limit - 0.01, False
    wanted = pot.get_clean_id()
    wanted_len = len(wanted)
    matcher = difflib.SequenceMatcher(lambda x: x == ' "', '', wanted)
    for candidate in defs.values():
        # Empty translation or empty msgid (header)
        if candidate.str == 'msgstr ""\n' or candidate.id == 'msgid ""\n':
            continue
        text = candidate.get_clean_id()
        size = len(text)
        # Simple and fast tests first
        if wanted_len > 2 * size or size > 1.5 * wanted_len:
            continue
        matcher.set_seq1(text)
        if matcher.quick_ratio() < best_ratio:
            continue
        score = matcher.ratio() # This is expensive
        if score > best_ratio:
            best_ratio, best = score, candidate
    return best
def flags(po, pot, fuzzy = False, obs = False):
    '''
    Build the flag line of an entry from the flag fields of the Msgs
    objects <po> and <pot>.

    If <fuzzy> is true the flags of <po> are ignored and the fuzzy flag
    is added.  If <obs> is true the c-format, python-format and
    docstring flags are removed.  The docstring flag is also removed
    unless the global variable option.docstrings is set.  Duplicates
    are dropped and fuzzy is moved to the front.  The return value is
    the combined flag line, or the empty string if no flag is left.
    '''
    global option
    if not (po.flag or pot.flag or fuzzy):
        return ''

    if fuzzy:
        left = '#, fuzzy'
    else:
        left = po.flag.strip()
    combined = '%s, %s' % (left, pot.flag.strip())

    # Keep the first occurrence of every non-empty item; the leading
    # '#' of the flag line is one of the items.
    parts = []
    for item in combined.split(', '):
        if item and item not in parts:
            parts.append(item)

    unwanted = []
    if not option.docstrings:
        unwanted.append('docstring')
    if obs:
        unwanted.extend(['c-format', 'python-format', 'docstring'])
    for item in unwanted:
        if item in parts:
            parts.remove(item)

    # Put fuzzy first (right behind the '#')
    if 'fuzzy' in parts:
        at = parts.index('fuzzy')
        if at != 1:
            parts[1], parts[at] = parts[at], parts[1]

    # Only the '#' left means there are no flags at all
    if len(parts) == 1:
        return ''
    return ', '.join(parts) + '\n'
def add(pot, po, fuzzy = False):
    '''
    Build a new entry from the Msgs objects <pot> and <po>.

    The translator comment and the msgstr come from <po>; the automatic
    comment, the references and the msgid come from <pot>; the flag
    line is made by flags().  If <fuzzy> is true, the flag field of
    <po> is ignored (in flags()).  Returns a multiline string with an
    up to date entry.
    '''
    pieces = [ po.cmt,
               pot.autocmt,
               pot.ref,
               flags(po, pot, fuzzy = fuzzy),
               pot.id,
               po.str ]
    return ''.join(pieces)
def header(pot, defs):
    '''
    Update date in header entry. Returns the updated header entry.

    <pot> is the header entry (a Msgs object) of the pot file and
    <defs> the dict of Msgs objects of the po file.  The value of the
    POT-Creation-Date field in <pot> replaces the one in the header
    entry of <defs>, which is changed in place and returned.

    Raises MsgmergeError if <defs> does not hold exactly one header
    entry or if the POT-Creation-Date field is missing in either
    header.
    '''
    headers = [ d for d in defs.values() if d.id == 'msgid ""\n' ]
    if not len(headers) == 1:
        raise MsgmergeError('Error: did not find header in po file.')
    po = headers[0]

    # group 1: everything up to and including the field name
    # group 2: the date
    # group 3: the closing \n" and the rest of the header
    r = re.compile(r'(.*^"POT-Creation-Date:\s+)(.*?)(\\n"$.*)',
                   re.MULTILINE | re.DOTALL)
    m = r.match(pot.str)
    if not m:
        raise MsgmergeError(
            'Error: did not find POT-Creation-Date field in pot file.')
    new_date = m.group(2)

    # Use a function as replacement.  In a template string the date
    # would follow directly after the group reference \1, and since a
    # date starts with digits the pair is read as an octal escape or as
    # a reference to a group which does not exist.
    def put_date(found):
        return found.group(1) + new_date + found.group(3)

    po.str, count = r.subn(put_date, po.str)
    if not count == 1:
        raise MsgmergeError(
            'Error: did not find POT-Creation-Date field in po file.')
    return po
def match(defs, refs):
    '''
    Try to match Msgs objects in <refs> with Msgs objects in
    <defs>. The return value is a list with po entries.

    The entries of <refs> are handled in the order of their line
    numbers.  The first one is taken to be the header.  Every other
    entry is combined with the entry of <defs> with the same msgid, or
    else with the best fuzzy match, or else with an empty translation.
    Entries of <defs> which were not used are added at the end as
    obsolete entries.  A dot is written to standard error for every
    entry if the global variable option.verbose is set.

    Raises MsgmergeError if <refs> is empty or if header() fails.
    '''
    global option
    matches = []
    empty = Msgs('msgid ""\n', 'msgstr ""\n', '', -1, 'str')
    deco = [(r.lno, r) for r in refs.values() ]
    deco.sort()
    if not deco:
        # Nothing was parsed from the pot file; report it like other
        # input problems rather than fail on pop() from an empty list.
        raise MsgmergeError('Error: did not find header in pot file.')
    po = header(deco.pop(0)[1], defs) # Header entry
    matches.append(add(empty, po))
    po.used()
    for lno, pot in deco:
        if option.verbose:
            sys.stderr.write('.')
        po = defs.get(pot.id, False) # Perfect match
        if po:
            matches.append(add(pot, po))
            po.used(); pot.used()
            continue
        po = fuzzy_match(pot, defs) # Fuzzy match
        if po:
            matches.append(add(pot, po, fuzzy = True))
            po.used(); pot.used()
            continue
        matches.append(add(pot, empty)) # No match

    obsolete(defs, matches)
    return matches
def obsolete(defs, matches):
    '''
    Handle obsolete translations.

    Every Msgs object in <defs> which was never used and has a
    non-empty msgstr is turned into an obsolete entry (flags reduced by
    flags(), lines prefixed by Msgs.obsolete()) and appended to the
    list <matches>, in the order of the line numbers.
    '''
    leftovers = []
    for d in defs.values():
        if d.count == 0 and not d.str == 'msgstr ""\n':
            leftovers.append((d.lno, d))
    leftovers.sort()
    # Stand-in for the pot side, which has no flags to contribute
    empty = Msgs('msgid ""\n', 'msgstr ""\n', '', -1, 'str')
    for pair in leftovers:
        old = pair[1]
        old.flag = flags(old, empty, obs = True)
        old.obsolete()
        matches.append('%s%s%s' % (old.flag, old.id, old.str))
def help():
    '''Print help text (the module docstring) and exit with status 0.'''
    # Same output as the former print statement, but valid on both
    # Python 2 and Python 3.
    sys.stdout.write('%s\n' % __doc__)
    sys.exit(0)
def cmdline():
    '''
    Parse options and arguments from command line and run merge().

    Usage errors and MsgmergeError are reported on standard output and
    end the program with exit status 1.  --help and --version print
    their text and exit with status 0.
    '''
    # The program name is filled in right away.  Formatting the whole
    # message a second time against globals() breaks as soon as the
    # getopt message itself contains a % character.
    advice = 'Try `%(name)s --help\' for more information.' % {'name' : name}
    long_opt = ['help', 'version', 'update', 'output-file=',
                'quiet', 'silent', 'docstrings']
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hVUo:qD', long_opt)
    except getopt.error as msg:
        sys.stdout.write('%s: %s\n%s\n' % (name, msg, advice))
        sys.exit(1)

    option = Options(cmdline = True)
    for opt, arg in opts:
        if opt in ['-h', '--help']:
            help()
        elif opt in ['-V', '--version']:
            sys.stdout.write('%s %s\n' % (name, __version__))
            sys.exit(0)
        elif opt in ['-o', '--output-file']:
            option.outfile = arg
        elif opt in ['-U', '--update']:
            option.update = True
        elif opt in ['-q', '--silent', '--quiet']:
            option.verbose = False
        elif opt in ['-D', '--docstrings']:
            option.docstrings = True

    # Sanity checks
    warn = False
    if option.update and option.outfile:
        warn = '--update and --output-file are mutually exclusive.'
    if len(args) == 0:
        warn = 'no input files given.'
    elif len(args) == 1 or len(args) > 2:
        warn = 'exactly 2 input files required.'
    if warn:
        sys.stdout.write('%s: %s\n%s\n' % (name, warn, advice))
        sys.exit(1)

    if option.update:
        # Update mode writes back to the po file
        option.outfile = args[0]
    elif not option.outfile:
        option.outfile = '-'

    defs, refs = args

    try:
        merge(defs, refs, option = option)
    except MsgmergeError as err:
        sys.stdout.write('%s: %s\n' % (name, err))
        sys.exit(1)
def io(iofile, mode = 'r'):
    '''
    Wrapper around open().

    Opens the file <iofile> in mode <mode> and returns the file object.
    An IOError is turned into a MsgmergeError which holds the system
    error text and the file name.
    '''
    try:
        fd = open(iofile, mode)
    except IOError as msg:
        # strerror is what msg[1] gave on Python 2; exceptions cannot be
        # indexed on Python 3.
        raise MsgmergeError('error while opening file: %s: %s.' %
                            (msg.strerror, iofile))
    return fd
def changes(new, old):
    '''
    Compare the new entries with the old file content.

    <new> is a list of entries which is joined with newlines, the way
    write() does it; <old> is the list of lines read from the existing
    file.  Returns 0 if both texts are equal, otherwise -1 or 1
    depending on whether the old text sorts before or after the new
    one, so the result is true exactly when there are changes.
    '''
    before = ''.join(old)
    after = '\n'.join(new)
    # Same values as cmp(before, after), which Python 3 no longer has
    return (before > after) - (before < after)
def write(matches, outfile):
    '''
    Write the list <matches> to file <outfile>.

    The entries are joined with newlines.  If <outfile> is - the text
    goes to standard output, otherwise the file is opened for writing
    with io() and closed again when the text has been written.
    '''
    text = '\n'.join(matches)
    if outfile == '-':
        sys.stdout.write(text)
        return
    fd = io(outfile, 'w')
    try:
        fd.write(text)
    finally:
        # Close explicitly so the data is on disk when we return
        fd.close()
def merge(def_file, ref_file, update = True, outfile = '-',
          docstrings = True, verbose = True, **kwds):
    '''
    Merge po file <def_file> with pot file <ref_file>. If <update> is
    set to True then only update if there are changes to the po
    file. Set outfile to write updated po file to another file. Set
    to - for writing to standard out. If docstrings is False the
    docstring flag will be removed. Set verbose to False to suppress
    progress indicators. <kwds> is used to pass options from the
    command line interface: if it holds an Options object under the
    key option, that object is used in place of the other arguments.

    The options end up in the global variable option, where flags()
    and match() read them.  In update mode the result is written to
    <def_file> itself, and only if it differs from the current content.
    Raises MsgmergeError on errors.
    '''
    global option
    option = kwds.get('option', Options(update = update,
                                        outfile = outfile,
                                        docstrings = docstrings,
                                        verbose = verbose))
    def_msgs = parse(def_file, 'msgstr')
    ref_msgs = parse(ref_file, 'msgid')
    # sys.stderr.write() in place of the Python 2 only "print >>" form
    if verbose and not __name__ == '__main__':
        sys.stderr.write('Merging %s with %s\n' % (ref_file, def_file))
    updated_lines = match(def_msgs, ref_msgs)
    if option.verbose:
        sys.stderr.write(' done.\n')

    if not option.update:
        write(updated_lines, option.outfile)
        return

    # Update mode: read the current content and close the file again
    # before it is possibly opened for writing.
    fd = io(def_file)
    try:
        current = fd.readlines()
    finally:
        fd.close()
    if changes(updated_lines, current):
        write(updated_lines, def_file)
def merge_dir(directory, pot = False, include = [], exclude = [],
              verbose = True):
    '''
    Tries to merge a directory of po files. Uses simple glob to find
    po files and pot file. The parameter <pot> can be used to specify
    the pot file in the directory. If the list <include> is given only
    files in this list are merged. Use the list <exclude> to exclude
    files to be merged. This function is only useful if po files and
    pot file are in the same directory. Set <verbose> to get
    information when running.

    Only the base names of <pot> and of the files in <include> are
    used.  Raises MsgmergeError if no pot file, more than one pot file
    or no po file is found by the glob.  A MsgmergeError from merging a
    single po file is reported on standard error and the remaining
    files are still processed.

    The list defaults are shared between calls, which is harmless here
    because they are only read, never changed.
    '''
    if directory.endswith('/'):
        directory = os.path.dirname(directory)
    if pot:
        pot = os.path.basename(pot)
    else:
        pot = glob.glob('%s/*.pot' % directory)
        if not pot:
            raise MsgmergeError('No pot file found.')
        elif len(pot) > 1:
            raise MsgmergeError('More than one pot file found: %s.' % pot)
        pot = os.path.basename(pot[0])

    if not include:
        pos = glob.glob('%s/*po' % directory)
        # A single po file is enough; only an empty result is an error
        if not pos:
            raise MsgmergeError('No po file(s) found.')
        pos = [ os.path.basename(po) for po in pos ]
    else:
        pos = [ os.path.basename(po) for po in include ]

    for po in exclude:
        if po in pos:
            pos.remove(po)

    format = '%s/%s'
    for po in pos:
        try:
            merge(format % (directory, po), format % (directory, pot),
                  update = True, verbose = verbose,
                  outfile = format % (directory, po))
        except MsgmergeError as err:
            # Report and go on with the next file
            if verbose:
                sys.stderr.write('%s Not updated.\n' % err)
            else:
                sys.stderr.write('%s %s not updated.\n' % (err, po))
# Run the command line interface only when executed as a script,
# not when imported as a module.
if __name__ == '__main__':
    cmdline()