3 # License: MIT (see LICENSE file provided)
4 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
7 **polib** allows you to manipulate, create, modify gettext files (pot, po and
8 mo files). You can load existing files, iterate through its entries, add,
9 modify entries, comments or metadata, etc. or create new po files from scratch.
11 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
12 :func:`~polib.mofile` convenience functions.
__author__ = 'David Jean Louis <izimobil@gmail.com>'
# public names of the module
__all__ = [
    'pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
    'default_encoding', 'escape', 'unescape', 'detect_encoding',
]

# the default encoding to use when encoding cannot be detected
default_encoding = 'utf-8'
34 # python 2/3 compatibility helpers {{{
if sys.version_info < (3,):
    # Python 2: native ``str`` is the byte type, ``unicode`` the text type.
    PY3 = False
    text_type = unicode  # noqa: F821 -- only evaluated on Python 2

    def b(s):
        """Return ``s`` as a byte string (already bytes on Python 2)."""
        return s

    def u(s):
        """Return ``s`` as a text string, interpreting escape sequences."""
        return unicode(s, "unicode_escape")  # noqa: F821

else:
    # Python 3: native ``str`` is the text type.
    PY3 = True
    text_type = str

    def b(s):
        """Return ``s`` as a byte string (latin-1 encoded)."""
        return s.encode("latin-1")

    def u(s):
        """Return ``s`` as a text string (already text on Python 3)."""
        return s
57 # _pofile_or_mofile {{{
def _pofile_or_mofile(f, type, **kwargs):
    """
    Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
    honor the DRY concept.

    Arguments:

    ``f``
        the file path or the file content, passed unchanged to
        :func:`detect_encoding` and to the parser.

    ``type``
        string, ``'pofile'`` selects ``_POFileParser``, any other value
        selects ``_MOFileParser``.

    ``**kwargs``
        the ``encoding``, ``check_for_duplicates``, ``klass`` and
        ``wrapwidth`` keys are read, all are optional.

    Returns the instance built by the parser, with its ``wrapwidth``
    attribute set (default: ``78``).
    """
    # get the file encoding, auto-detect it when the caller gave none
    enc = kwargs.get('encoding')
    if enc is None:
        enc = detect_encoding(f, type == 'mofile')

    # parse the file
    kls = _POFileParser if type == 'pofile' else _MOFileParser
    parser = kls(
        f,
        encoding=enc,
        check_for_duplicates=kwargs.get('check_for_duplicates', False),
        klass=kwargs.get('klass')
    )
    instance = parser.parse()
    instance.wrapwidth = kwargs.get('wrapwidth', 78)
    return instance
def _is_file(filename_or_contents):
    """
    Safely returns the value of os.path.isfile(filename_or_contents).

    Arguments:

    ``filename_or_contents``
        either a filename, or a string holding the contents of some file.
        In the latter case, this function will always return False.
    """
    try:
        return os.path.isfile(filename_or_contents)
    except (TypeError, ValueError, UnicodeEncodeError):
        # the argument cannot be interpreted as a path at all
        # (wrong type, embedded NUL, unencodable characters)
        return False
100 # function pofile() {{{
def pofile(pofile, **kwargs):
    """
    Convenience function that parses the po or pot file ``pofile`` and returns
    a :class:`~polib.POFile` instance.

    Arguments:

    ``pofile``
        string, full or relative path to the po/pot file or its content (data).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.POFile`
        instance).
    """
    return _pofile_or_mofile(pofile, 'pofile', **kwargs)
132 # function mofile() {{{
def mofile(mofile, **kwargs):
    """
    Convenience function that parses the mo file ``mofile`` and returns a
    :class:`~polib.MOFile` instance.

    Arguments:

    ``mofile``
        string, full or relative path to the mo file or its content (string
        or bytes).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext to generate the po file that was used to format the mo file
        (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.MOFile`
        instance).
    """
    return _pofile_or_mofile(mofile, 'mofile', **kwargs)
166 # function detect_encoding() {{{
def detect_encoding(file, binary_mode=False):
    """
    Try to detect the encoding used by the ``file``. The ``file`` argument can
    be a PO or MO file path or a string containing the contents of the file.
    If the encoding cannot be detected, the function will return the value of
    ``default_encoding``.

    Arguments:

    ``file``
        string, full or relative path to the po/mo file or its content.

    ``binary_mode``
        boolean, set this to True if ``file`` is a mo file.
    """
    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
    rxt = re.compile(u(PATTERN))
    rxb = re.compile(b(PATTERN))

    def charset_exists(charset):
        """Check whether ``charset`` is valid or not."""
        try:
            codecs.lookup(charset)
        except LookupError:
            return False
        return True

    def charset_from(match):
        """Return the charset captured by ``match`` as text, or ``None``
        when no codec of that name is registered."""
        enc = match.group(1).strip()
        if not isinstance(enc, text_type):
            enc = enc.decode('utf-8')
        if charset_exists(enc):
            return enc
        return None

    if not _is_file(file):
        # ``file`` holds the content itself: try the text pattern first and
        # fall back to the bytes pattern when the content is not text.
        try:
            match = rxt.search(file)
        except TypeError:
            match = rxb.search(file)
        if match:
            enc = charset_from(match)
            if enc is not None:
                return enc
    else:
        # For PY3, always treat as binary
        if binary_mode or PY3:
            mode, rx = 'rb', rxb
        else:
            mode, rx = 'r', rxt
        # the context manager closes the handle on every exit path,
        # including exceptions raised while decoding the charset name
        with open(file, mode) as f:
            for line in f:
                match = rx.search(line)
                if match:
                    enc = charset_from(match)
                    if enc is not None:
                        return enc
    return default_encoding
228 # function escape() {{{
def escape(st):
    """
    Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
    the given string ``st`` and returns it.
    """
    # the backslash must be handled first, otherwise the backslashes
    # introduced by the following replacements would be escaped again
    return (
        st.replace('\\', r'\\')
        .replace('\t', r'\t')
        .replace('\r', r'\r')
        .replace('\n', r'\n')
        .replace('"', r'\"')
    )
242 # function unescape() {{{
def unescape(st):
    """
    Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
    the given string ``st`` and returns it.
    """
    # escaped character -> real character
    replacements = {'n': '\n', 't': '\t', 'r': '\r', '\\': '\\'}

    def unescape_repl(m):
        m = m.group(1)
        # any other captured character is returned as is,
        # this handles the escaped double quote
        return replacements.get(m, m)

    # a single left-to-right pass, so that an escaped backslash followed
    # by "n" is not mistaken for an escaped newline
    return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
263 # function natural_sort() {{{
def natural_sort(lst):
    """
    Sort naturally the given list.
    Credits: http://stackoverflow.com/a/4836734

    Runs of digits are compared as integers and the other parts are
    compared case-insensitively. Returns a new sorted list.
    """
    def convert(text):
        return int(text) if text.isdigit() else text.lower()

    def alphanum_key(key):
        # the capturing group keeps the digit runs in the split result
        return [convert(c) for c in re.split('([0-9]+)', key)]

    return sorted(lst, key=alphanum_key)
280 # class _BaseFile {{{
283 class _BaseFile(list):
285 Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
286 classes. This class should **not** be instantiated directly.
289 def __init__(self
, *args
, **kwargs
):
291 Constructor, accepts the following keyword arguments:
294 string, the path to the po or mo file, or its content as a string.
297 integer, the wrap width, only useful when the ``-w`` option was
298 passed to xgettext (optional, default: ``78``).
301 string, the encoding to use, defaults to ``default_encoding``
302 global variable (optional).
304 ``check_for_duplicates``
305 whether to check for duplicate entries when adding entries to the
306 file, (optional, default: ``False``).
309 # the opened file handle
310 pofile
= kwargs
.get('pofile', None)
311 if pofile
and _is_file(pofile
):
314 self
.fpath
= kwargs
.get('fpath')
315 # the width at which lines should be wrapped
316 self
.wrapwidth
= kwargs
.get('wrapwidth', 78)
318 self
.encoding
= kwargs
.get('encoding', default_encoding
)
319 # whether to check for duplicate entries or not
320 self
.check_for_duplicates
= kwargs
.get('check_for_duplicates', False)
323 # both po and mo files have metadata
325 self
.metadata_is_fuzzy
= 0
327 def __unicode__(self
):
329 Returns the unicode representation of the file.
332 entries
= [self
.metadata_as_entry()] + \
333 [e
for e
in self
if not e
.obsolete
]
334 for entry
in entries
:
335 ret
.append(entry
.__unicode
__(self
.wrapwidth
))
336 for entry
in self
.obsolete_entries():
337 ret
.append(entry
.__unicode
__(self
.wrapwidth
))
338 ret
= u('\n').join(ret
)
343 return self
.__unicode
__()
347 Returns the string representation of the file.
349 return unicode(self
).encode(self
.encoding
)
351 def __contains__(self
, entry
):
353 Overridden ``list`` method to implement the membership test (in and
355 The method considers that an entry is in the file if it finds an entry
356 that has the same msgid (the test is **case sensitive**) and the same
357 msgctxt (or none for both entries).
362 an instance of :class:`~polib._BaseEntry`.
364 return self
.find(entry
.msgid
, by
='msgid', msgctxt
=entry
.msgctxt
) \
367 def __eq__(self
, other
):
368 return str(self
) == str(other
)
370 def append(self
, entry
):
372 Overridden method to check for duplicate entries: if a user tries to
373 add an entry that is already in the file, the method will raise a
374 ``ValueError`` exception.
379 an instance of :class:`~polib._BaseEntry`.
381 # check_for_duplicates may not be defined (yet) when unpickling.
382 # But if pickling, we never want to check for duplicates anyway.
383 if getattr(self
, 'check_for_duplicates', False) and entry
in self
:
384 raise ValueError('Entry "%s" already exists' % entry
.msgid
)
385 super(_BaseFile
, self
).append(entry
)
387 def insert(self
, index
, entry
):
389 Overridden method to check for duplicate entries: if a user tries to
390 add an entry that is already in the file, the method will raise a
391 ``ValueError`` exception.
396 index at which the entry should be inserted.
399 an instance of :class:`~polib._BaseEntry`.
401 if self
.check_for_duplicates
and entry
in self
:
402 raise ValueError('Entry "%s" already exists' % entry
.msgid
)
403 super(_BaseFile
, self
).insert(index
, entry
)
405 def metadata_as_entry(self
):
407 Returns the file metadata as a :class:`~polib.POEntry` instance.
409 e
= POEntry(msgid
='')
410 mdata
= self
.ordered_metadata()
413 for name
, value
in mdata
:
414 # Strip whitespace off each line in a multi-line entry
415 strs
.append('%s: %s' % (name
, value
))
416 e
.msgstr
= '\n'.join(strs
) + '\n'
417 if self
.metadata_is_fuzzy
:
418 e
.flags
.append('fuzzy')
421 def save(self
, fpath
=None, repr_method
='__unicode__', newline
=None):
423 Saves the po file to ``fpath``.
424 If it is an existing file and no ``fpath`` is provided, then the
425 existing file is rewritten with the modified data.
430 string, full or relative path to the file.
433 string, the method to use for output.
436 string, controls how universal newlines works
438 if self
.fpath
is None and fpath
is None:
439 raise IOError('You must provide a file path to save() method')
440 contents
= getattr(self
, repr_method
)()
443 if repr_method
== 'to_binary':
444 fhandle
= open(fpath
, 'wb')
449 encoding
=self
.encoding
,
452 if not isinstance(contents
, text_type
):
453 contents
= contents
.decode(self
.encoding
)
454 fhandle
.write(contents
)
456 # set the file path if not set
457 if self
.fpath
is None and fpath
:
460 def find(self
, st
, by
='msgid', include_obsolete_entries
=False,
463 Find the entry whose msgid (or property identified by the ``by``
464 argument) matches the string ``st``.
469 string, the string to search for.
472 string, the property to use for comparison (default: ``msgid``).
474 ``include_obsolete_entries``
475 boolean, whether to also search in entries that are obsolete.
478 string, allows specifying a specific message context for the
481 if include_obsolete_entries
:
484 entries
= [e
for e
in self
if not e
.obsolete
]
487 if getattr(e
, by
) == st
:
488 if msgctxt
is not False and e
.msgctxt
!= msgctxt
:
491 if len(matches
) == 1:
493 elif len(matches
) > 1:
495 # find the entry with no msgctxt
502 # fallback to the first entry found
506 def ordered_metadata(self
):
508 Convenience method that returns an ordered version of the metadata
509 dictionary. The return value is a list of tuples (metadata name,
512 # copy the dict first
513 metadata
= self
.metadata
.copy()
515 'Project-Id-Version',
516 'Report-Msgid-Bugs-To',
524 'Content-Transfer-Encoding',
528 for data
in data_order
:
530 value
= metadata
.pop(data
)
531 ordered_data
.append((data
, value
))
534 # the rest of the metadata will be alphabetically ordered since there
535 # are no specs for this AFAIK
536 for data
in natural_sort(metadata
.keys()):
537 value
= metadata
[data
]
538 ordered_data
.append((data
, value
))
543 Return the binary representation of the file.
546 entries
= self
.translated_entries()
548 # the keys are sorted in the .mo file
549 def cmp(_self
, other
):
550 # msgfmt compares entries with msgctxt if it exists
551 self_msgid
= _self
.msgctxt
and _self
.msgctxt
or _self
.msgid
552 other_msgid
= other
.msgctxt
and other
.msgctxt
or other
.msgid
553 if self_msgid
> other_msgid
:
555 elif self_msgid
< other_msgid
:
560 entries
.sort(key
=lambda o
: o
.msgid_with_context
.encode('utf-8'))
561 mentry
= self
.metadata_as_entry()
562 entries
= [mentry
] + entries
563 entries_len
= len(entries
)
564 ids
, strs
= b(''), b('')
566 # For each string, we need size and file offset. Each string is
567 # NUL terminated; the NUL does not count into the size.
570 # Contexts are stored by storing the concatenation of the
571 # context, a <EOT> byte, and the original string
572 msgid
= self
._encode
(e
.msgctxt
+ '\4')
575 for index
in sorted(e
.msgstr_plural
.keys()):
576 msgstr
.append(e
.msgstr_plural
[index
])
577 msgid
+= self
._encode
(e
.msgid
+ '\0' + e
.msgid_plural
)
578 msgstr
= self
._encode
('\0'.join(msgstr
))
580 msgid
+= self
._encode
(e
.msgid
)
581 msgstr
= self
._encode
(e
.msgstr
)
582 offsets
.append((len(ids
), len(msgid
), len(strs
), len(msgstr
)))
583 ids
+= msgid
+ b('\0')
584 strs
+= msgstr
+ b('\0')
586 # The header is 7 32-bit unsigned integers.
587 keystart
= 7 * 4 + 16 * entries_len
588 # and the values start after the keys
589 valuestart
= keystart
+ len(ids
)
592 # The string table first has the list of keys, then the list of values.
593 # Each entry has first the size of the string, then the file offset.
594 for o1
, l1
, o2
, l2
in offsets
:
595 koffsets
+= [l1
, o1
+ keystart
]
596 voffsets
+= [l2
, o2
+ valuestart
]
597 offsets
= koffsets
+ voffsets
599 output
= struct
.pack(
609 # start of value index
610 7 * 4 + entries_len
* 8,
611 # size and offset of hash table, we don't use hash tables
615 if PY3
and sys
.version_info
.minor
> 1: # python 3.2 or superior
616 output
+= array
.array("i", offsets
).tobytes()
618 output
+= array
.array("i", offsets
).tostring()
623 def _encode(self
, mixed
):
625 Encodes the given ``mixed`` argument with the file encoding if and
626 only if it is a unicode string and returns the encoded string.
628 if isinstance(mixed
, text_type
):
629 mixed
= mixed
.encode(self
.encoding
)
635 class POFile(_BaseFile
):
637 Po (or Pot) file reader/writer.
638 This class inherits the :class:`~polib._BaseFile` class and, by extension,
639 the python ``list`` type.
642 def __unicode__(self
):
644 Returns the unicode representation of the po file.
646 ret
, headers
= '', self
.header
.split('\n')
647 for header
in headers
:
650 elif header
[:1] in [',', ':']:
651 ret
+= '#%s\n' % header
653 ret
+= '# %s\n' % header
655 if not isinstance(ret
, text_type
):
656 ret
= ret
.decode(self
.encoding
)
658 return ret
+ _BaseFile
.__unicode
__(self
)
660 def save_as_mofile(self
, fpath
):
662 Saves the binary representation of the file to given ``fpath``.
667 string, full or relative path to the mo file.
669 _BaseFile
.save(self
, fpath
, 'to_binary')
671 def percent_translated(self
):
673 Convenience method that returns the percentage of translated
676 total
= len([e
for e
in self
if not e
.obsolete
])
679 translated
= len(self
.translated_entries())
680 return int(translated
* 100 / float(total
))
682 def translated_entries(self
):
684 Convenience method that returns the list of translated entries.
686 return [e
for e
in self
if e
.translated()]
688 def untranslated_entries(self
):
690 Convenience method that returns the list of untranslated entries.
692 return [e
for e
in self
if not e
.translated() and not e
.obsolete
695 def fuzzy_entries(self
):
697 Convenience method that returns the list of fuzzy entries.
699 return [e
for e
in self
if e
.fuzzy
and not e
.obsolete
]
701 def obsolete_entries(self
):
703 Convenience method that returns the list of obsolete entries.
705 return [e
for e
in self
if e
.obsolete
]
707 def merge(self
, refpot
):
709 Convenience method that merges the current pofile with the pot file
710 provided. It behaves exactly as the gettext msgmerge utility:
712 * comments of this file will be preserved, but extracted comments and
713 occurrences will be discarded;
714 * any translations or comments in the file will be discarded, however,
715 dot comments and file positions will be preserved;
716 * the fuzzy flags are preserved.
721 object POFile, the reference catalog.
723 # Store entries in dict/set for faster access
725 (entry
.msgid_with_context
, entry
) for entry
in self
727 refpot_msgids
= set(entry
.msgid_with_context
for entry
in refpot
)
728 # Merge entries that are in the refpot
730 e
= self_entries
.get(entry
.msgid_with_context
)
735 # ok, now we must "obsolete" entries that are not in the refpot anymore
737 if entry
.msgid_with_context
not in refpot_msgids
:
738 entry
.obsolete
= True
743 class MOFile(_BaseFile
):
745 Mo file reader/writer.
746 This class inherits the :class:`~polib._BaseFile` class and, by
747 extension, the python ``list`` type.
750 MAGIC_SWAPPED
= 0xde120495
752 def __init__(self
, *args
, **kwargs
):
754 Constructor, accepts all keywords arguments accepted by
755 :class:`~polib._BaseFile` class.
757 _BaseFile
.__init
__(self
, *args
, **kwargs
)
758 self
.magic_number
= None
761 def save_as_pofile(self
, fpath
):
763 Saves the mofile as a pofile to ``fpath``.
768 string, full or relative path to the file.
770 _BaseFile
.save(self
, fpath
)
772 def save(self
, fpath
=None):
774 Saves the mofile to ``fpath``.
779 string, full or relative path to the file.
781 _BaseFile
.save(self
, fpath
, 'to_binary')
783 def percent_translated(self
):
785 Convenience method to keep the same interface with POFile instances.
789 def translated_entries(self
):
791 Convenience method to keep the same interface with POFile instances.
795 def untranslated_entries(self
):
797 Convenience method to keep the same interface with POFile instances.
801 def fuzzy_entries(self
):
803 Convenience method to keep the same interface with POFile instances.
807 def obsolete_entries(self
):
809 Convenience method to keep the same interface with POFile instances.
813 # class _BaseEntry {{{
816 class _BaseEntry(object):
818 Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
819 This class should **not** be instantiated directly.
822 def __init__(self
, *args
, **kwargs
):
824 Constructor, accepts the following keyword arguments:
827 string, the entry msgid.
830 string, the entry msgstr.
833 string, the entry msgid_plural.
836 dict, the entry msgstr_plural lines.
839 string, the entry context (msgctxt).
842 bool, whether the entry is "obsolete" or not.
845 string, the encoding to use, defaults to ``default_encoding``
846 global variable (optional).
848 self
.msgid
= kwargs
.get('msgid', '')
849 self
.msgstr
= kwargs
.get('msgstr', '')
850 self
.msgid_plural
= kwargs
.get('msgid_plural', '')
851 self
.msgstr_plural
= kwargs
.get('msgstr_plural', {})
852 self
.msgctxt
= kwargs
.get('msgctxt', None)
853 self
.obsolete
= kwargs
.get('obsolete', False)
854 self
.encoding
= kwargs
.get('encoding', default_encoding
)
856 def __unicode__(self
, wrapwidth
=78):
858 Returns the unicode representation of the entry.
865 # write the msgctxt if any
866 if self
.msgctxt
is not None:
867 ret
+= self
._str
_field
("msgctxt", delflag
, "", self
.msgctxt
,
870 ret
+= self
._str
_field
("msgid", delflag
, "", self
.msgid
, wrapwidth
)
871 # write the msgid_plural if any
872 if self
.msgid_plural
:
873 ret
+= self
._str
_field
("msgid_plural", delflag
, "",
874 self
.msgid_plural
, wrapwidth
)
875 if self
.msgstr_plural
:
876 # write the msgstr_plural if any
877 msgstrs
= self
.msgstr_plural
881 msgstr
= msgstrs
[index
]
882 plural_index
= '[%s]' % index
883 ret
+= self
._str
_field
("msgstr", delflag
, plural_index
, msgstr
,
886 # otherwise write the msgstr
887 ret
+= self
._str
_field
("msgstr", delflag
, "", self
.msgstr
,
890 ret
= u('\n').join(ret
)
895 return self
.__unicode
__()
899 Returns the string representation of the entry.
901 return unicode(self
).encode(self
.encoding
)
903 def __eq__(self
, other
):
904 return str(self
) == str(other
)
906 def _str_field(self
, fieldname
, delflag
, plural_index
, field
,
908 lines
= field
.splitlines(True)
910 lines
= [''] + lines
# start with initial empty line
912 escaped_field
= escape(field
)
913 specialchars_count
= 0
914 for c
in ['\\', '\n', '\r', '\t', '"']:
915 specialchars_count
+= field
.count(c
)
916 # comparison must take into account fieldname length + one space
917 # + 2 quotes (eg. msgid "<string>")
918 flength
= len(fieldname
) + 3
920 flength
+= len(plural_index
)
921 real_wrapwidth
= wrapwidth
- flength
+ specialchars_count
922 if wrapwidth
> 0 and len(field
) > real_wrapwidth
:
923 # Wrap the line but take field name into account
924 lines
= [''] + [unescape(item
) for item
in textwrap
.wrap(
926 wrapwidth
- 2, # 2 for quotes ""
927 drop_whitespace
=False,
928 break_long_words
=False
932 if fieldname
.startswith('previous_'):
933 # quick and dirty trick to get the real field name
934 fieldname
= fieldname
[9:]
936 ret
= ['%s%s%s "%s"' % (delflag
, fieldname
, plural_index
,
937 escape(lines
.pop(0)))]
939 ret
.append('%s"%s"' % (delflag
, escape(line
)))
943 def msgid_with_context(self
):
945 return '%s%s%s' % (self
.msgctxt
, "\x04", self
.msgid
)
951 class POEntry(_BaseEntry
):
953 Represents a po file entry.
956 def __init__(self
, *args
, **kwargs
):
958 Constructor, accepts the following keyword arguments:
961 string, the entry comment.
964 string, the entry translator comment.
967 list, the entry occurrences.
970 list, the entry flags.
973 string, the entry previous context.
976 string, the entry previous msgid.
978 ``previous_msgid_plural``
979 string, the entry previous msgid_plural.
982 integer, the line number of the entry
984 _BaseEntry
.__init
__(self
, *args
, **kwargs
)
985 self
.comment
= kwargs
.get('comment', '')
986 self
.tcomment
= kwargs
.get('tcomment', '')
987 self
.occurrences
= kwargs
.get('occurrences', [])
988 self
.flags
= kwargs
.get('flags', [])
989 self
.previous_msgctxt
= kwargs
.get('previous_msgctxt', None)
990 self
.previous_msgid
= kwargs
.get('previous_msgid', None)
991 self
.previous_msgid_plural
= kwargs
.get('previous_msgid_plural', None)
992 self
.linenum
= kwargs
.get('linenum', None)
994 def __unicode__(self
, wrapwidth
=78):
996 Returns the unicode representation of the entry.
999 # comments first, if any (with text wrapping as xgettext does)
1001 comments
= [('tcomment', '# ')]
1003 comments
= [('comment', '#. '), ('tcomment', '# ')]
1005 val
= getattr(self
, c
[0])
1007 for comment
in val
.split('\n'):
1008 if wrapwidth
> 0 and len(comment
) + len(c
[1]) > wrapwidth
:
1009 ret
+= textwrap
.wrap(
1012 initial_indent
=c
[1],
1013 subsequent_indent
=c
[1],
1014 break_long_words
=False
1017 ret
.append('%s%s' % (c
[1], comment
))
1019 # occurrences (with text wrapping as xgettext does)
1020 if not self
.obsolete
and self
.occurrences
:
1022 for fpath
, lineno
in self
.occurrences
:
1024 filelist
.append('%s:%s' % (fpath
, lineno
))
1026 filelist
.append(fpath
)
1027 filestr
= ' '.join(filelist
)
1028 if wrapwidth
> 0 and len(filestr
) + 3 > wrapwidth
:
1029 # textwrap split words that contain hyphen, this is not
1030 # what we want for filenames, so the dirty hack is to
1031 # temporarily replace hyphens with a char that a file cannot
1033 ret
+= [line
.replace('*', '-') for line
in textwrap
.wrap(
1034 filestr
.replace('-', '*'),
1036 initial_indent
='#: ',
1037 subsequent_indent
='#: ',
1038 break_long_words
=False
1041 ret
.append('#: ' + filestr
)
1043 # flags (TODO: wrapping ?)
1045 ret
.append('#, %s' % ', '.join(self
.flags
))
1047 # previous context and previous msgid/msgid_plural
1048 fields
= ['previous_msgctxt', 'previous_msgid',
1049 'previous_msgid_plural']
1055 val
= getattr(self
, f
)
1057 ret
+= self
._str
_field
(f
, prefix
, "", val
, wrapwidth
)
1059 ret
.append(_BaseEntry
.__unicode
__(self
, wrapwidth
))
1060 ret
= u('\n').join(ret
)
1063 def __cmp__(self
, other
):
1065 Called by comparison operations if rich comparison is not defined.
1067 # First: Obsolete test
1068 if self
.obsolete
!= other
.obsolete
:
1073 # Work on a copy to protect original
1074 occ1
= sorted(self
.occurrences
[:])
1075 occ2
= sorted(other
.occurrences
[:])
1081 msgctxt
= self
.msgctxt
or '0'
1082 othermsgctxt
= other
.msgctxt
or '0'
1083 if msgctxt
> othermsgctxt
:
1085 elif msgctxt
< othermsgctxt
:
1087 # Compare msgid_plural
1088 msgid_plural
= self
.msgid_plural
or '0'
1089 othermsgid_plural
= other
.msgid_plural
or '0'
1090 if msgid_plural
> othermsgid_plural
:
1092 elif msgid_plural
< othermsgid_plural
:
1094 # Compare msgstr_plural
1095 if self
.msgstr_plural
and isinstance(self
.msgstr_plural
, dict):
1096 msgstr_plural
= list(self
.msgstr_plural
.values())
1099 if other
.msgstr_plural
and isinstance(other
.msgstr_plural
, dict):
1100 othermsgstr_plural
= list(other
.msgstr_plural
.values())
1102 othermsgstr_plural
= []
1103 if msgstr_plural
> othermsgstr_plural
:
1105 elif msgstr_plural
< othermsgstr_plural
:
1108 if self
.msgid
> other
.msgid
:
1110 elif self
.msgid
< other
.msgid
:
1113 if self
.msgstr
> other
.msgstr
:
1115 elif self
.msgstr
< other
.msgstr
:
1119 def __gt__(self
, other
):
1120 return self
.__cmp
__(other
) > 0
1122 def __lt__(self
, other
):
1123 return self
.__cmp
__(other
) < 0
1125 def __ge__(self
, other
):
1126 return self
.__cmp
__(other
) >= 0
1128 def __le__(self
, other
):
1129 return self
.__cmp
__(other
) <= 0
1131 def __eq__(self
, other
):
1132 return self
.__cmp
__(other
) == 0
1134 def __ne__(self
, other
):
1135 return self
.__cmp
__(other
) != 0
1137 def translated(self
):
1139 Returns ``True`` if the entry has been translated or ``False``
1142 if self
.obsolete
or self
.fuzzy
:
1144 if self
.msgstr
!= '':
1146 if self
.msgstr_plural
:
1147 for pos
in self
.msgstr_plural
:
1148 if self
.msgstr_plural
[pos
] == '':
1153 def merge(self
, other
):
1155 Merge the current entry with the given pot entry.
1157 self
.msgid
= other
.msgid
1158 self
.msgctxt
= other
.msgctxt
1159 self
.occurrences
= other
.occurrences
1160 self
.comment
= other
.comment
1162 self
.flags
= other
.flags
[:] # clone flags
1164 self
.flags
.append('fuzzy')
1165 self
.msgid_plural
= other
.msgid_plural
1166 self
.obsolete
= other
.obsolete
1167 self
.previous_msgctxt
= other
.previous_msgctxt
1168 self
.previous_msgid
= other
.previous_msgid
1169 self
.previous_msgid_plural
= other
.previous_msgid_plural
1170 if other
.msgstr_plural
:
1171 for pos
in other
.msgstr_plural
:
1173 # keep existing translation at pos if any
1174 self
.msgstr_plural
[pos
]
1176 self
.msgstr_plural
[pos
] = ''
1180 return 'fuzzy' in self
.flags
1183 return hash((self
.msgid
, self
.msgstr
))
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """
    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments,
        for consistency with :class:`~polib.POEntry`:

        ``comment``
        ``tcomment``
        ``occurrences``
        ``flags``
        ``previous_msgctxt``
        ``previous_msgid``
        ``previous_msgid_plural``

        Note: even though these keyword arguments are accepted,
        they hold no real meaning in the context of MO files
        and are simply ignored.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # po-only attributes are always reset to their empty value,
        # whatever the caller passed in
        self.comment = ''
        self.tcomment = ''
        self.occurrences = []
        self.flags = []
        self.previous_msgctxt = None
        self.previous_msgid = None
        self.previous_msgid_plural = None

    def __hash__(self):
        """Hash the entry on its ``(msgid, msgstr)`` pair."""
        return hash((self.msgid, self.msgstr))
1222 # class _POFileParser {{{
1225 class _POFileParser(object):
1227 A finite state machine to parse efficiently and correctly po
1231 def __init__(self
, pofile
, *args
, **kwargs
):
1238 string, path to the po file or its content
1241 string, the encoding to use, defaults to ``default_encoding``
1242 global variable (optional).
1244 ``check_for_duplicates``
1245 whether to check for duplicate entries when adding entries to the
1246 file (optional, default: ``False``).
1248 enc
= kwargs
.get('encoding', default_encoding
)
1249 if _is_file(pofile
):
1251 self
.fhandle
= io
.open(pofile
, 'rt', encoding
=enc
)
1253 enc
= default_encoding
1254 self
.fhandle
= io
.open(pofile
, 'rt', encoding
=enc
)
1256 self
.fhandle
= pofile
.splitlines()
1258 klass
= kwargs
.get('klass')
1261 self
.instance
= klass(
1264 check_for_duplicates
=kwargs
.get('check_for_duplicates', False)
1266 self
.transitions
= {}
1267 self
.current_line
= 0
1268 self
.current_entry
= POEntry(linenum
=self
.current_line
)
1269 self
.current_state
= 'st'
1270 self
.current_token
= None
1271 # two memo flags used in handlers
1272 self
.msgstr_index
= 0
1273 self
.entry_obsolete
= 0
1274 # Configure the state machine, by adding transitions.
1275 # Meaning of symbols:
1276 # * ST: Beginning of the file (start)
1278 # * TC: a translation comment
1279 # * GC: a generated comment
1280 # * OC: a file/line occurrence
1281 # * FL: a flags line
1282 # * CT: a message context
1283 # * PC: a previous msgctxt
1284 # * PM: a previous msgid
1285 # * PP: a previous msgid_plural
1287 # * MP: a msgid plural
1289 # * MX: a msgstr plural
1290 # * MC: a msgid or msgstr continuation line
1291 all
= ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp', 'tc',
1292 'ms', 'mp', 'mx', 'mi']
1294 self
.add('tc', ['st', 'he'], 'he')
1295 self
.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms',
1296 'mp', 'mx', 'mi'], 'tc')
1297 self
.add('gc', all
, 'gc')
1298 self
.add('oc', all
, 'oc')
1299 self
.add('fl', all
, 'fl')
1300 self
.add('pc', all
, 'pc')
1301 self
.add('pm', all
, 'pm')
1302 self
.add('pp', all
, 'pp')
1303 self
.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm',
1304 'pp', 'ms', 'mx'], 'ct')
1305 self
.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc',
1306 'pm', 'pp', 'ms', 'mx'], 'mi')
1307 self
.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
1308 self
.add('ms', ['mi', 'mp', 'tc'], 'ms')
1309 self
.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
1310 self
.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')
1314 Run the state machine, parse the file line by line and call process()
1315 with the current matched symbol.
1322 'msgid_plural': 'mp',
1325 'msgid_plural': 'pp',
1330 fpath
= '%s ' % self
.instance
.fpath
if self
.instance
.fpath
else ''
1331 for line
in self
.fhandle
:
1332 self
.current_line
+= 1
1333 if self
.current_line
== 1:
1334 BOM
= codecs
.BOM_UTF8
.decode('utf-8')
1335 if line
.startswith(BOM
):
1336 line
= line
[len(BOM
):]
1341 tokens
= line
.split(None, 2)
1342 nb_tokens
= len(tokens
)
1344 if tokens
[0] == '#~|':
1347 if tokens
[0] == '#~' and nb_tokens
> 1:
1348 line
= line
[3:].strip()
1351 self
.entry_obsolete
= 1
1353 self
.entry_obsolete
= 0
1355 # Take care of keywords like
1356 # msgid, msgid_plural, msgctxt & msgstr.
1357 if tokens
[0] in keywords
and nb_tokens
> 1:
1358 line
= line
[len(tokens
[0]):].lstrip()
1359 if re
.search(r
'([^\\]|^)"', line
[1:-1]):
1360 raise IOError('Syntax error in po file %s(line %s): '
1361 'unescaped double quote found' %
1362 (fpath
, self
.current_line
))
1363 self
.current_token
= line
1364 self
.process(keywords
[tokens
[0]])
1367 self
.current_token
= line
1369 if tokens
[0] == '#:':
1372 # we are on an occurrences line
1375 elif line
[:1] == '"':
1376 # we are on a continuation line
1377 if re
.search(r
'([^\\]|^)"', line
[1:-1]):
1378 raise IOError('Syntax error in po file %s(line %s): '
1379 'unescaped double quote found' %
1380 (fpath
, self
.current_line
))
1383 elif line
[:7] == 'msgstr[':
1384 # we are on a msgstr plural
1387 elif tokens
[0] == '#,':
1390 # we are on a flags line
1393 elif tokens
[0] == '#' or tokens
[0].startswith('##'):
1396 # we are on a translator comment line
1399 elif tokens
[0] == '#.':
1402 # we are on a generated comment line
1405 elif tokens
[0] == '#|':
1407 raise IOError('Syntax error in po file %s(line %s)' %
1408 (fpath
, self
.current_line
))
1410 # Remove the marker and any whitespace right after that.
1411 line
= line
[2:].lstrip()
1412 self
.current_token
= line
1414 if tokens
[1].startswith('"'):
1415 # Continuation of previous metadata.
1420 # Invalid continuation line.
1421 raise IOError('Syntax error in po file %s(line %s): '
1422 'invalid continuation line' %
1423 (fpath
, self
.current_line
))
1425 # we are on a "previous translation" comment line,
1426 if tokens
[1] not in prev_keywords
:
1427 # Unknown keyword in previous translation comment.
1428 raise IOError('Syntax error in po file %s(line %s): '
1429 'unknown keyword %s' %
1430 (fpath
, self
.current_line
,
1433 # Remove the keyword and any whitespace
1434 # between it and the starting quote.
1435 line
= line
[len(tokens
[1]):].lstrip()
1436 self
.current_token
= line
1437 self
.process(prev_keywords
[tokens
[1]])
1440 raise IOError('Syntax error in po file %s(line %s)' %
1441 (fpath
, self
.current_line
))
1443 if self
.current_entry
and len(tokens
) > 0 and \
1444 not tokens
[0].startswith('#'):
1445 # since entries are added when another entry is found, we must add
1446 # the last entry here (only if there are lines). Trailing comments
1448 self
.instance
.append(self
.current_entry
)
1450 # before returning the instance, check if there's metadata and if
1451 # so extract it in a dict
1452 metadataentry
= self
.instance
.find('')
1453 if metadataentry
: # metadata found
1455 self
.instance
.remove(metadataentry
)
1456 self
.instance
.metadata_is_fuzzy
= metadataentry
.flags
1458 for msg
in metadataentry
.msgstr
.splitlines():
1460 key
, val
= msg
.split(':', 1)
1461 self
.instance
.metadata
[key
] = val
.strip()
1462 except (ValueError, KeyError):
1464 self
.instance
.metadata
[key
] += '\n' + msg
.strip()
1466 if not isinstance(self
.fhandle
, list): # must be file
1467 self
.fhandle
.close()
1468 return self
.instance
def add(self, symbol, states, next_state):
    """
    Register a transition in the state machine.

    Keywords arguments:

    ``symbol``
        string, the matched token (two chars symbol).

    ``states``
        list, the states (two chars symbols) the transition starts from.

    ``next_state``
        the state the fsm will have after the action; the action itself
        is the ``handle_<next_state>`` method of the parser.
    """
    # one (symbol, origin) key per origin state, all pointing at the same
    # handler/target pair
    self.transitions.update(
        ((symbol, origin),
         (getattr(self, 'handle_%s' % next_state), next_state))
        for origin in states
    )
def process(self, symbol):
    """
    Run the transition registered for ``symbol`` in the current state.

    The action stored in ``self.transitions`` is called and the current
    state is only updated when that action returns a true value.

    Keywords arguments:

    ``symbol``
        string, the matched token (two chars symbol).

    Any failure (unknown transition or error raised by the action) closes
    the file handle, when it can be closed, and is reported as an
    ``IOError`` carrying the file path and the current line number.
    """
    try:
        action, next_state = self.transitions[(symbol, self.current_state)]
        if action():
            self.current_state = next_state
    except Exception:
        if self.instance.fpath:
            location = '%s ' % self.instance.fpath
        else:
            location = ''
        if hasattr(self.fhandle, 'close'):
            self.fhandle.close()
        raise IOError('Syntax error in po file %s(line %s)' %
                      (location, self.current_line))
def handle_he(self):
    """Handle a header comment."""
    header = self.instance.header
    if header != '':
        # header lines are stored newline-separated
        header += '\n'
    # drop the two leading characters of the comment line
    self.instance.header = header + self.current_token[2:]
    return True
def handle_tc(self):
    """Handle a translator comment."""
    if self.current_state in ('mc', 'ms', 'mx'):
        # the previous entry is complete: store it and start a new one
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    entry = self.current_entry
    if entry.tcomment != '':
        entry.tcomment += '\n'
    # strip every leading '#', then at most one space after them
    text = self.current_token.lstrip('#')
    if text[:1] == ' ':
        text = text[1:]
    entry.tcomment += text
    return True
def handle_gc(self):
    """Handle a generated comment."""
    if self.current_state in ('mc', 'ms', 'mx'):
        # the previous entry is complete: store it and start a new one
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    entry = self.current_entry
    if entry.comment != '':
        entry.comment += '\n'
    # skip the first three characters of the line
    entry.comment += self.current_token[3:]
    return True
def handle_oc(self):
    """Handle a file:num occurrence."""
    if self.current_state in ('mc', 'ms', 'mx'):
        # the previous entry is complete: store it and start a new one
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    for reference in self.current_token[3:].split():
        if reference == '':
            continue
        try:
            path, lineno = reference.rsplit(':', 1)
            if not lineno.isdigit():
                # the part after the last colon is not a line number:
                # keep the whole reference as the file name
                path, lineno = reference, ''
            self.current_entry.occurrences.append((path, lineno))
        except (ValueError, AttributeError):
            # no colon at all: file name without line number
            self.current_entry.occurrences.append((reference, ''))
    return True
def handle_fl(self):
    """Handle a flags line."""
    if self.current_state in ('mc', 'ms', 'mx'):
        # the previous entry is complete: store it and start a new one
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    # flags are comma separated after the first three characters
    raw_flags = self.current_token[3:].split(',')
    self.current_entry.flags += [flag.strip() for flag in raw_flags]
    return True
def handle_pp(self):
    """Handle a previous msgid_plural line."""
    if self.current_state in ('mc', 'ms', 'mx'):
        # the previous entry is complete: store it and start a new one
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    # drop the first and last characters (the surrounding quotes)
    quoted = self.current_token
    self.current_entry.previous_msgid_plural = unescape(quoted[1:-1])
    return True
def handle_pm(self):
    """Handle a previous msgid line."""
    if self.current_state in ('mc', 'ms', 'mx'):
        # the previous entry is complete: store it and start a new one
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    # drop the first and last characters (the surrounding quotes)
    quoted = self.current_token
    self.current_entry.previous_msgid = unescape(quoted[1:-1])
    return True
def handle_pc(self):
    """Handle a previous msgctxt line."""
    if self.current_state in ('mc', 'ms', 'mx'):
        # the previous entry is complete: store it and start a new one
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    # drop the first and last characters (the surrounding quotes)
    quoted = self.current_token
    self.current_entry.previous_msgctxt = unescape(quoted[1:-1])
    return True
def handle_ct(self):
    """Handle a msgctxt."""
    if self.current_state in ('mc', 'ms', 'mx'):
        # the previous entry is complete: store it and start a new one
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    # drop the first and last characters (the surrounding quotes)
    quoted = self.current_token
    self.current_entry.msgctxt = unescape(quoted[1:-1])
    return True
def handle_mi(self):
    """Handle a msgid."""
    if self.current_state in ('mc', 'ms', 'mx'):
        # the previous entry is complete: store it and start a new one
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    entry = self.current_entry
    # carry over the obsolete marker recorded for the current line
    entry.obsolete = self.entry_obsolete
    # drop the first and last characters (the surrounding quotes)
    entry.msgid = unescape(self.current_token[1:-1])
    return True
def handle_mp(self):
    """Handle a msgid plural."""
    # drop the first and last characters (the surrounding quotes)
    unquoted = self.current_token[1:-1]
    self.current_entry.msgid_plural = unescape(unquoted)
    return True
def handle_ms(self):
    """Handle a msgstr."""
    # drop the first and last characters (the surrounding quotes)
    unquoted = self.current_token[1:-1]
    self.current_entry.msgstr = unescape(unquoted)
    return True
def handle_mx(self):
    """
    Handle a msgstr plural.

    The current token starts with ``msgstr[`` followed by the plural index
    and the quoted translation. The translation is stored under that index
    in the entry's ``msgstr_plural`` mapping and the index is remembered in
    ``self.msgstr_index`` for the continuation lines that may follow.
    """
    token = self.current_token
    # Read every digit after "msgstr[" instead of the single character at
    # position 7, otherwise "msgstr[10]" would be stored as index 1.
    digits = re.match(r'\d+', token[7:])
    # Without leading digits, fall back to the single character so that
    # invalid input fails exactly as it always did.
    index = int(digits.group()) if digits else int(token[7])
    # the value is everything between the first quote and the last char
    value = token[token.find('"') + 1:-1]
    self.current_entry.msgstr_plural[index] = unescape(value)
    self.msgstr_index = index
    return True
def handle_mc(self):
    """Handle a msgid or msgstr continuation line."""
    token = unescape(self.current_token[1:-1])
    state = self.current_state
    if state == 'mx':
        # plural translations are keyed by the index of the last msgstr[N]
        self.current_entry.msgstr_plural[self.msgstr_index] += token
    else:
        # entry attribute extended by a continuation line in each state
        targets = {
            'ct': 'msgctxt',
            'mi': 'msgid',
            'mp': 'msgid_plural',
            'ms': 'msgstr',
            'pp': 'previous_msgid_plural',
            'pm': 'previous_msgid',
            'pc': 'previous_msgctxt',
        }
        attribute = targets.get(state)
        if attribute is not None:
            entry = self.current_entry
            setattr(entry, attribute, getattr(entry, attribute) + token)
    # don't change the current state
    return False
1656 # class _MOFileParser {{{
1659 class _MOFileParser(object):
1661 A class to parse binary mo files.
def __init__(self, mofile, *args, **kwargs):
    """
    Constructor.

    Keyword arguments:

    ``mofile``
        string, path to the mo file or its content

    ``encoding``
        string, the encoding to use, defaults to ``default_encoding``
        global variable (optional).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class used to build the instance (optional, ``MOFile`` when not
        given).
    """
    # an existing path is opened in binary mode, anything else is treated
    # as the raw content of a mo file
    if _is_file(mofile):
        handle = open(mofile, 'rb')
    else:
        handle = io.BytesIO(mofile)
    self.fhandle = handle

    factory = kwargs.get('klass')
    if factory is None:
        factory = MOFile
    options = {
        'fpath': mofile,
        'encoding': kwargs.get('encoding', default_encoding),
        'check_for_duplicates': kwargs.get('check_for_duplicates', False),
    }
    self.instance = factory(**options)
def __del__(self):
    """
    Make sure the file is closed, this prevents warnings on unclosed file
    when running tests with python >= 3.2.
    """
    # The finalizer also runs when the constructor failed before
    # ``fhandle`` was assigned; it must not raise AttributeError then.
    fhandle = getattr(self, 'fhandle', None)
    if fhandle is not None and hasattr(fhandle, 'close'):
        fhandle.close()
def parse(self):
    """
    Build the instance with the file handle provided in the
    constructor.

    Reads the magic number, the revision, the two index tables and then
    every msgid/msgstr pair. The pair with an empty msgid at index 0 is
    stored as the instance metadata, every other pair is appended to the
    instance as an entry.

    Returns the populated instance. Raises ``IOError`` when the magic
    number or the major revision number is not an expected one. The file
    handle is closed on return and on error.
    """
    try:
        # parse magic number
        magic_number = self._readbinary('<I', 4)
        if magic_number == MOFile.MAGIC:
            ii = '<II'
        elif magic_number == MOFile.MAGIC_SWAPPED:
            ii = '>II'
        else:
            raise IOError('Invalid mo file, magic number is incorrect !')
        self.instance.magic_number = magic_number
        # parse the version number and the number of strings
        version, numofstrings = self._readbinary(ii, 8)
        # from MO file format specs: "A program seeing an unexpected major
        # revision number should stop reading the MO file entirely"
        if version >> 16 not in (0, 1):
            raise IOError(
                'Invalid mo file, unexpected major revision number')
        self.instance.version = version
        # original strings and translation strings hash table offset
        msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
        # move to msgid hash table and read length and offset of msgids
        self.fhandle.seek(msgids_hash_offset)
        msgids_index = [self._readbinary(ii, 8)
                        for _ in range(numofstrings)]
        # move to msgstr hash table and read length and offset of msgstrs
        self.fhandle.seek(msgstrs_hash_offset)
        msgstrs_index = [self._readbinary(ii, 8)
                         for _ in range(numofstrings)]
        # build entries
        encoding = self.instance.encoding
        for i in range(numofstrings):
            # each index item is a (length, offset) pair
            self.fhandle.seek(msgids_index[i][1])
            msgid = self.fhandle.read(msgids_index[i][0])

            self.fhandle.seek(msgstrs_index[i][1])
            msgstr = self.fhandle.read(msgstrs_index[i][0])
            if i == 0 and not msgid:  # metadata
                metadata = {}
                for line in msgstr.split(b('\n')):
                    tokens = line.split(b(':'), 1)
                    if tokens[0] != b(''):
                        key = tokens[0].decode(encoding)
                        # a line without colon gets an empty value
                        raw = tokens[1] if len(tokens) > 1 else b('')
                        metadata[key] = raw.decode(encoding).strip()
                self.instance.metadata = metadata
                continue
            # test if we have a plural entry
            msgid_tokens = msgid.split(b('\0'))
            if len(msgid_tokens) > 1:
                entry = self._build_entry(
                    msgid=msgid_tokens[0],
                    msgid_plural=msgid_tokens[1],
                    msgstr_plural=dict(enumerate(msgstr.split(b('\0'))))
                )
            else:
                entry = self._build_entry(msgid=msgid, msgstr=msgstr)
            self.instance.append(entry)
        return self.instance
    finally:
        # close opened file, even when the content turned out to be invalid
        self.fhandle.close()
def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
                 msgstr_plural=None):
    """
    Build a ``MOEntry`` from the raw strings read in the mo file.

    Every value is decoded with the encoding of the instance.

    Keyword arguments:

    ``msgid``
        bytes, the message id, prefixed by its context and a ``\\x04``
        separator when the message has a context.

    ``msgstr``
        bytes, the translation (optional).

    ``msgid_plural``
        bytes, the plural message id (optional).

    ``msgstr_plural``
        dict, the plural translations as bytes keyed by plural index
        (optional); it is left untouched, a decoded copy is stored.
    """
    encoding = self.instance.encoding
    # Only the first \x04 separates the context from the msgid; splitting
    # on every occurrence would truncate a msgid containing that character.
    msgctxt, separator, text = msgid.partition(b('\x04'))
    if separator:
        kwargs = {
            'msgctxt': msgctxt.decode(encoding),
            'msgid': text.decode(encoding),
        }
    else:
        kwargs = {'msgid': msgid.decode(encoding)}
    if msgstr:
        kwargs['msgstr'] = msgstr.decode(encoding)
    if msgid_plural:
        kwargs['msgid_plural'] = msgid_plural.decode(encoding)
    if msgstr_plural:
        # decode into a new dict so the caller's mapping is not modified
        kwargs['msgstr_plural'] = dict(
            (index, raw.decode(encoding))
            for index, raw in msgstr_plural.items()
        )
    return MOEntry(**kwargs)
1794 def _readbinary(self
, fmt
, numbytes
):
1796 Private method that unpack n bytes of data using format <fmt>.
1797 It returns a tuple or a mixed value if the tuple length is 1.
1799 bytes
= self
.fhandle
.read(numbytes
)
1800 tup
= struct
.unpack(fmt
, bytes
)