2 # -*- coding: iso-8859-1 -*-
4 # Copyright Terje Røsten <terjeros@phys.ntnu.no> Nov. 2003.
6 '''Merge two Uniforum style .po files together.
8 This is an implementation (not complete) in Python of the GNU
9 msgmerge(1) program. It can be used on the command line (or as a Python
12 Usage: msgmerge.py [OPTIONS] def.po ref.pot
14 The def.po file is an existing PO file with translations. The ref.pot
15 file is the last created PO file with up-to-date source references but
16 old translations, or a PO Template file.
19 -U, --update update def.po,
20 do nothing if def.po is already up to date.
21 -o, --output-file=FILE write output to file FILE. Output is written to
22 stdout if set to - or if the option is not present.
23 -D, --docstrings don\'t remove docstring flag.
24 -h, --help display help text and exit.
25 -V, --version display version and exit.
26 -q, --quiet, --silent suppress progress indicators.
28 from __future__
import generators
30 if not __name__
== '__main__':
33 When used as module the interesting functions are merge() and
36 The merge() function does the same as the command line version, and
37 the arguments are as follows. The first argument is the def.po file,
38 then the ref.pot file. The third argument controls whether to work in
39 update mode or not, then the next argument sets the output file. Set
40 the next argument to False to remove docstring flags. The last
41 argument can be used to suppress progress indicators. The default is
42 to work in update mode with progress indicators.
45 merge("def.po", "ref.pot")
46 merge the files def.po and ref.pot and write output to def.po if
47 there are any changes.
48 merge("def.po", "ref.pot", docstrings = False, verbose = False,
49 update = False, outfile = "-")
50 merge the files def.po and ref.pot and write output to stdout,
51 remove docstring flag and be quiet.
53 The merge_dir() function is useful when merging a directory of po
54 files. The only required argument is the name of the directory with po
55 files and the pot file. It will use simple glob to find the files. The
56 second argument can be used to specify the pot file (in the
57 directory). Third argument is a list of po files (then globbing will
58 not be used) and the next argument is a list of filenames to exclude. The
59 last argument can be used to suppress progress indicators. Docstring
60 flag will not be removed.
64 merge (and update) all po files in directory po with the single pot
65 file in the same directory.
67 The module raises the MsgmergeError exception in case of error.
72 __all__
= [ 'merge', 'merge_dir', 'MsgmergeError' ]
88 '''Class to hold information about messages.'''
91 def __init__(self
, msgid
, msgstr
, flag
, lno
, entry
, **kwds
):
94 self
.cmt
= kwds
.get('cmt', '')
95 self
.ref
= kwds
.get('ref', '')
96 self
.autocmt
= kwds
.get('autocmt', '')
102 self
.id = wash(self
.id, width
= self
.width
,
103 filename
= self
.file, lno
= self
.lno
)
104 self
.str = wash(self
.str, 'msgstr', width
= self
.width
,
105 filename
= self
.file, lno
= self
.lno
)
def get_clean_id(self):
    '''Return self.id with the first occurrence of 'msgid "' removed.'''
    marker = 'msgid "'
    # Only the first occurrence is dropped; any later ones are kept.
    cleaned = self.id.replace(marker, '', 1)
    return cleaned
111 self
.width
-= len('#~ ')
113 t
= [ '#~ %s\n' % s
for s
in self
.id.splitlines() ]
115 t
= [ '#~ %s\n' % s
for s
in self
.str.splitlines() ]
116 self
.str = ''.join(t
)
119 '''Class to hold options'''
120 def __init__(self
, cmdline
= False, **kwds
):
122 self
.update
= kwds
.get('update', True)
123 self
.outfile
= kwds
.get('outfile', '-')
124 self
.docstrings
= kwds
.get('docstrings', True)
125 self
.verbose
= kwds
.get('verbose', False)
129 self
.docstrings
= False
132 class MsgmergeError(Exception):
133 '''Exception class for msgmerge'''
134 def __init__(self
, args
):
139 Generator which returns a line (with the obsolete prefix removed)
140 from the list of lines in <lines>, the line number is also
146 yield l
.replace('#~ ', '', 1), lno
149 def slurp(s
, g
, sign
):
151 The string returned from iterator <g>\'s next() method is added to
152 the string <s> if string returned is beginning with the string
153 <sign>. The return value is the first returned string which does not
154 start with <sign>, the line number, the iterator <g> and the
155 (possibly) updated string <s>.
158 while l
.startswith(sign
) or (sign
== '# ' and l
.strip() == '#'):
163 def splitted_fit(chunk
, line
, width
, break_always
, break_after_space
):
165 Check if string <chunk> can be split by newline to fit into
166 string <line> with width smaller than <width>. The return value is
167 a tuple where the first element is the part of chunk which fits
168 and the second element is the rest of chunk.
172 for i
in range(l
- 1, -1, -1):
173 if chunk
[i
] in break_always
and len(chunk
[0:i
] + line
) <= width
:
174 ret
= chunk
[0:i
], chunk
[i
:]
176 elif chunk
[i
] in break_after_space
and i
and chunk
[i
-1].strip() == '':
177 ret
= chunk
[0:i
], chunk
[i
:]
179 elif chunk
[i
] == '\\' and len(chunk
[i
:]) > 1 and chunk
[i
+1] == '"' \
180 and len(chunk
[0:i
] + line
) <= width
:
181 ret
= chunk
[0:i
], chunk
[i
:]
185 def wrap(msg
, width
):
187 Accept a list <msg> of strings to wrap, each string is wrapped to
188 width <width> and surrounded with a pair of ". The return value is
189 a string with these wrapped strings joined together with newlines.
191 if msg
.isspace() or not msg
:
194 # \ and " is here, but " is special in po files.
195 break_always
= '$%+({['
196 # XXX what about: « © » ¦ § etc?
197 break_after_space
= '_-=^`~\'<|>&*#@'
198 enders
= '.:,;!?/])}|%-'
199 extra
= string
.punctuation
201 extra
= extra
.replace(c
, '')
202 escaped
= { 'enders' : re
.escape(enders
),
203 'extra' : re
.escape(extra
) }
204 regex
= r
'([\w%(extra)s]*[\s%(enders)s)]+[\s%(enders)s]*)' % escaped
205 r
= re
.compile(regex
, re
.UNICODE
)
206 msg
= [ m
for m
in r
.split(msg
) if not m
== '']
211 # Handle \n on end of line
212 if len(msg
) > 1 and msg
[-1] == 'n' and len(msg
[-2]) > 0 \
213 and msg
[-2][-1] == '\\':
216 # Do not allow a single \n on a line
217 if len(msg
) > 2 and msg
[-1] == '\\n':
222 if len(line
) > width
or len(m
) > width
or len(line
+ m
) > width
:
223 fit
, rest
= splitted_fit(m
, line
, width
, break_always
,
231 lines
= [ '"%s"' % l
for l
in lines
]
232 return '\n'.join(lines
)
234 def normalize(lines
):
236 Normalize <lines>: e.g "\n\nText\n\n" becomes:
242 if 0 < lines
.find('\\n') < len(lines
) - 3:
243 if lines
[-3:] == '\\n"':
244 lines
= lines
[:-3].replace('\\n','\\n"\n"').replace('""\n','') \
247 lines
= lines
.replace('\\n','\\n"\n"').replace('""\n','')
250 def wash(msg
, idx
= 'msgid', width
= 80, **kwds
):
252 Do washing on the msgstr or msgid fields. Wrap the text to fit in
253 width <width>. <msg> is a list of lines that makes up the field.
254 <idx> indicate msgid or msgstr, <width> holds the width. <filename>
255 and <lno> (line number) is picked up from <kwds>.
256 Returns the washed field as a string.
259 lines
= msg
.splitlines()
261 if size
> 1 or len(msg
) > width
:
263 # The first line is special
264 m
= re
.match('^%s "(.*)"$' % (idx
, ), lines
[0])
267 kwds
['lno'] -= size
+ 1
268 raise MsgmergeError('parse error: %(filename)s:%(lno)s.'
270 washed
.append(m
.group(1))
271 if m
.group(1).endswith(r
'\n'):
274 for line
in lines
[1:]:
275 m
= re
.match('^"(\s*.*)"$', line
)
279 kwds
['lno'] -= size
- i
+ 1
280 raise MsgmergeError('parse error: %(filename)s:%(lno)s.'
282 washed
[-1] += m
.group(1)
283 if m
.group(1).endswith(r
'\n'):
290 washed
= [ wrap(w
, width
- 3) for w
in washed
] # " and \n removed.
292 # One line or multiline
293 if len(washed
) == 1 and len('%s %s\n' % (idx
, washed
[0])) < width
:
294 washed
= '%s %s\n' % (idx
, washed
[0])
296 washed
= '%s ""\n%s\n' % (idx
, '\n'.join(washed
))
302 def parse(filename
, entry
):
304 Parse po or pot file with name <filename>. Set the variable
305 <entry> to msgid/msgstr to indicate pot/po file. The return value
306 is a dict with msgid (washed) as key and Msgs instances as
309 lines
= io(filename
).readlines()
314 cmt
= autocmt
= ref
= flag
= ''
317 while not lno
== last
:
319 if l
.startswith('# '):
320 l
, lno
, g
, cmt
= slurp(l
, g
, '# ')
321 if l
.startswith('#.'):
322 l
, lno
, g
, autocmt
= slurp(l
, g
, '#.')
323 if l
.startswith('#:'):
324 l
, lno
, g
, ref
= slurp(l
, g
, '#:')
325 if l
.startswith('#,'):
326 l
, lno
, g
, flag
= slurp(l
, g
, '#,')
327 if l
.startswith('msgid'):
328 l
, lno
, g
, msgid
= slurp(l
, g
, '"')
329 if l
.startswith('msgstr'):
330 l
, lno
, g
, msgstr
= slurp(l
, g
, '"')
332 if not lno
== last
and not l
.strip() == '':
333 raise MsgmergeError('parse error: %s:%s.' % (filename
, lno
))
335 if msgid
and entry
== 'msgstr':
336 idx
= wash(msgid
, filename
= filename
, lno
= lno
)
337 messages
[idx
] = Msgs(msgid
, msgstr
, flag
, lno
, entry
, cmt
= cmt
)
338 msgid
= False; msgstr
= cmt
= autocmt
= ref
= flag
= ''
339 elif msgid
and entry
== 'msgid':
340 idx
= wash(msgid
, filename
= filename
, lno
= lno
)
341 messages
[idx
] = Msgs(msgid
, msgstr
, flag
, lno
, entry
,
342 autocmt
= autocmt
, ref
= ref
)
343 msgid
= False; msgstr
= cmt
= autocmt
= ref
= flag
= ''
345 for m
in messages
.values():
349 def fuzzy_match(pot
, defs
):
351 Try to find the best difflib match (with ratio > 0.6) between
352 id of Msgs object <pot> and Msgs in the dict <defs>.
353 Return value is the Msgs object in <defs> with highest ratio,
354 False is returned if no suitable Msgs is found.
357 l
, po
= limit
- 0.01, False
358 s
= difflib
.SequenceMatcher(lambda x
: x
== ' "', '', pot
.get_clean_id())
359 len2
= len(pot
.get_clean_id())
360 for candidate
in defs
.values():
361 if candidate
.str == 'msgstr ""\n': # Empty translation
363 if candidate
.id == 'msgid ""\n': # Empty msgid (header)
365 len1
= len(candidate
.get_clean_id())
366 if len2
> 2 * len1
or len1
> 1.5 * len2
: # Simple and fast tests first
368 s
.set_seq1(candidate
.get_clean_id())
369 if s
.quick_ratio() < l
:
371 r
= s
.ratio() # This is expensive
376 def flags(po
, pot
, fuzzy
= False, obs
= False):
378 Create flag field from flag field in Msgs objects <po> and
379 <pot>. When <fuzzy> is true <po>\'s flags are ignored and the
380 fuzzy flag is added. If <obs> is set then most flags but fuzzy are
381 removed. If the global variable option.docstrings is set then
382 docstring flags will not be removed. The return value is a string
383 which holds the combined flag.
387 if po
.flag
or pot
.flag
or fuzzy
:
389 flag
= '%s, %s' % (po
.flag
.strip(), pot
.flag
.strip())
391 flag
= '%s, %s' % ('#, fuzzy', pot
.flag
.strip())
392 flag
= flag
.split(', ')
394 flag
= [fl
.setdefault(f
, f
) for f
in flag
if f
not in fl
and f
]
395 if not option
.docstrings
:
397 flag
.remove('docstring')
401 removes
= ['c-format', 'python-format', 'docstring']
402 for remove
in removes
:
408 if 'fuzzy' in flag
and not flag
.index('fuzzy') == 1:
409 i
= flag
.index('fuzzy')
410 flag
[1], flag
[i
] = flag
[i
], flag
[1]
415 flag
= ', '.join(flag
) + '\n'
418 def add(pot
, po
, fuzzy
= False):
420 Build a new entry from the Msgs objects <pot> and <po>. If
421 <fuzzy> is true, <po>\'s flag field is ignored (in
422 flags()). Returns a multiline string with an up-to-date entry.
426 msg
.append(pot
.autocmt
)
428 msg
.append(flags(po
, pot
, fuzzy
= fuzzy
))
433 def header(pot
, defs
):
435 Update date in header entry. Returns the updated header entry.
438 [po
] = [ d
for d
in defs
.values() if d
.id == 'msgid ""\n' ]
440 raise MsgmergeError('Error: did not find header in po file.')
442 r
= re
.compile(r
'(.*^"POT-Creation-Date:\s+)(.*?)(\\n"$.*)',
443 re
.MULTILINE | re
.DOTALL
)
447 'Error: did not find POT-Creation-Date field in pot file.')
449 subs
= '\\1%s\\3' % m
.group(2)
450 po
.str, count
= r
.subn(subs
, po
.str)
453 'Error: did not find POT-Creation-Date field in po file.')
456 def match(defs
, refs
):
458 Try to match Msgs objects in <refs> with Msgs objects in
459 <defs>. The return value is a list with po entries.
463 empty
= Msgs('msgid ""\n', 'msgstr ""\n', '', -1, 'str')
464 deco
= [(r
.lno
, r
) for r
in refs
.values() ]
466 po
= header(deco
.pop(0)[1], defs
) # Header entry
467 matches
.append(add(empty
, po
))
469 sorted = [ a
[1] for a
in deco
]
472 sys
.stderr
.write('.')
473 po
= defs
.get(pot
.id, False) # Perfect match
475 matches
.append(add(pot
, po
))
476 po
.used(); pot
.used()
478 po
= fuzzy_match(pot
, defs
) # Fuzzy match
480 matches
.append(add(pot
, po
, fuzzy
= True))
481 po
.used(); pot
.used()
483 matches
.append(add(pot
, empty
)) # No match
485 obsolete(defs
, matches
)
488 def obsolete(defs
, matches
):
489 '''Handle obsolete translations.'''
490 deco
= [ (d
.lno
, d
) for d
in defs
.values() if
491 d
.count
== 0 and not d
.str == 'msgstr ""\n' ]
493 empty
= Msgs('msgid ""\n', 'msgstr ""\n', '', -1, 'str')
494 obs
= [ o
[1] for o
in deco
]
496 o
.flag
= flags(o
, empty
, obs
= True)
498 matches
.append('%s%s%s' % (o
.flag
, o
.id, o
.str))
501 '''Print help text and exit.'''
506 '''Parse options and arguments from command line.'''
507 advice
= 'Try `%(name)s --help\' for more information.'
509 long_opt
= ['help', 'version', 'update', 'output-file=',
510 'quiet', 'silent', 'docstrings']
511 opts
, args
= getopt
.getopt(sys
.argv
[1:], 'hVUo:qD', long_opt
)
512 except getopt
.error
, msg
:
513 print '%s: %s\n%s' % ('%(name)s', msg
, advice
) % globals()
516 option
= Options(cmdline
= True)
517 for opt
, arg
in opts
:
518 if opt
in ['-h', '--help']:
520 elif opt
in ['-V', '--version']:
521 print '%(name)s %(__version__)s' % globals()
523 elif opt
in ['-o', '--output-file']:
525 elif opt
in ['-U', '--update']:
527 elif opt
in ['-q', '--silent', '--quiet']:
528 option
.verbose
= False
529 elif opt
in ['-D', '--docstrings']:
530 option
.docstrings
= True
534 if option
.update
and option
.outfile
:
535 warn
= '--update and --output-file are mutually exclusive.'
537 warn
= 'no input files given.'
538 elif len(args
) == 1 or len(args
) > 2:
539 warn
= 'exactly 2 input files required.'
541 print '%s: %s\n%s' % ('%(name)s', warn
, advice
) % globals()
545 option
.outfile
= args
[0]
546 elif not option
.outfile
:
552 merge(defs
, refs
, option
= option
)
553 except MsgmergeError
, err
:
554 print '%(name)s: ' % globals() + '%s' % err
557 def io(iofile
, mode
= 'r'):
558 '''Wrapper around open().'''
560 fd
= open(iofile
, mode
)
562 raise MsgmergeError('error while opening file: %s: %s.' %
def changes(new, old):
    '''
    Compare freshly merged output with the current file contents.

    <new> is a list of entries which are joined with newlines (the
    same way write() emits them), <old> is a list of lines as returned
    by readlines() and is joined without a separator.

    The return value is 0 when both texts are equal, -1 when the old
    text sorts before the new text and 1 otherwise, so the result is
    true exactly when there is something to update.
    '''
    old_text = ''.join(old)
    new_text = '\n'.join(new)
    # cmp() only exists in Python 2; this expression gives the same
    # -1/0/1 result on every Python version.
    return (old_text > new_text) - (old_text < new_text)
569 def write(matches
, outfile
):
570 '''Write the list <matches> to file <outfile>'''
571 if not outfile
== '-':
572 fd
= io(outfile
, 'w')
575 fd
.write('\n'.join(matches
))
577 def merge(def_file
, ref_file
, update
= True, outfile
= '-',
578 docstrings
= True, verbose
= True, **kwds
):
580 Merge po file <def_file> with pot file <ref_file> . If <update> is
581 set to True then only update if there are changes to the po
582 file. Set outfile to write updated po file to another file. Set
583 to `-\' for writing to standard out. If docstrings is False
584 docstrings flag will be removed. Set verbose to False to suppress
585 progress indicators. <kwds> is used to pass options from the
586 command line interface.
589 option
= kwds
.get('option', Options(update
= update
,
591 docstrings
= docstrings
,
593 def_msgs
= parse(def_file
, 'msgstr')
594 ref_msgs
= parse(ref_file
, 'msgid')
595 if verbose
and not __name__
== '__main__':
596 print >> sys
.stderr
, 'Merging %s with %s' % (ref_file
, def_file
)
597 updated_lines
= match(def_msgs
, ref_msgs
)
599 print >> sys
.stderr
, ' done.'
600 if not option
.update
:
601 write(updated_lines
, option
.outfile
)
602 elif option
.update
and changes(updated_lines
, io(def_file
).readlines()):
603 write(updated_lines
, def_file
)
605 def merge_dir(directory
, pot
= False, include
= [], exclude
= [],
608 Tries to merge a directory of po files. Uses simple glob to find
609 po files and pot file. The parameter <pot> can be used to specify
610 the pot file in the directory. If the list <include> is given only
611 files in this list are merged. Use the list <exclude> to exclude
612 files to be merged. This function is only useful if po files and
613 pot file are in the same directory. Set <verbose> to get
614 information when running.
616 if directory
[-1] == '/':
617 directory
= os
.path
.dirname(directory
)
619 pot
= os
.path
.basename(pot
)
621 pot
= glob
.glob('%s/*.pot' % directory
)
623 raise MsgmergeError('No pot file found.')
625 raise MsgmergeError('More than one pot file found: %s.' % pot
)
626 pot
= os
.path
.basename(pot
[0])
629 pos
= glob
.glob('%s/*po' % directory
)
631 raise MsgmergeError('No po file(s) found.')
632 pos
= [ os
.path
.basename(po
) for po
in pos
]
634 pos
= [ os
.path
.basename(po
) for po
in include
]
644 merge(format
% (directory
, po
), format
% (directory
, pot
),
645 update
= True, verbose
= verbose
,
646 outfile
= format
% (directory
, po
))
647 except MsgmergeError
, err
:
649 print >> sys
.stderr
, '%s Not updated.' % err
651 print >> sys
.stderr
, '%s %s not updated.' % (err
, po
)
653 if __name__
== '__main__':