2 # License: MIT (see extras/polib/LICENSE file provided)
3 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
6 **polib** allows you to manipulate, create, modify gettext files (pot, po and
7 mo files). You can load existing files, iterate through its entries, add,
8 modify entries, comments or metadata, etc. or create new po files from scratch.
10 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
11 :func:`~polib.mofile` convenience functions.
25 __author__
= 'David Jean Louis <izimobil@gmail.com>'
41 # the default encoding to use when encoding cannot be detected
42 default_encoding
= 'utf-8'
44 # python 2/3 compatibility helpers {{{
52 return s
.encode('utf-8')
60 # _pofile_or_mofile {{{
def _pofile_or_mofile(f, filetype, **kwargs):
    """
    Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
    honor the DRY concept.

    Arguments:

    ``f``
        the file path or file content handed over by the public wrapper.

    ``filetype``
        string, ``'pofile'`` selects the po parser, any other value selects
        the mo parser.

    ``**kwargs``
        optional ``encoding``, ``check_for_duplicates``, ``klass`` and
        ``wrapwidth`` settings.

    Returns the instance produced by the parser's ``parse()`` method, with
    its ``wrapwidth`` attribute set.
    """
    # get the file encoding; auto-detect only when the caller gave none
    enc = kwargs.get('encoding')
    if enc is None:
        enc = detect_encoding(f, filetype == 'mofile')

    # parse the file
    kls = _POFileParser if filetype == 'pofile' else _MOFileParser
    parser = kls(
        f,
        encoding=enc,
        check_for_duplicates=kwargs.get('check_for_duplicates', False),
        klass=kwargs.get('klass'),
    )
    instance = parser.parse()
    instance.wrapwidth = kwargs.get('wrapwidth', 78)
    return instance
90 def _is_file(filename_or_contents
):
92 Safely returns the value of os.path.exists(filename_or_contents).
96 ``filename_or_contents``
97 either a filename, or a string holding the contents of some file.
98 In the latter case, this function will always return False.
101 return os
.path
.isfile(filename_or_contents
)
102 except (TypeError, ValueError, UnicodeEncodeError):
107 # function pofile() {{{
def pofile(pofile, **kwargs):
    """
    Convenience function that parses the po or pot file ``pofile`` and returns
    a :class:`~polib.POFile` instance.

    Arguments:

    ``pofile``
        string, full or relative path to the po/pot file or its content (data).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.POFile`
        instance).
    """
    return _pofile_or_mofile(pofile, 'pofile', **kwargs)
141 # function mofile() {{{
def mofile(mofile, **kwargs):
    """
    Convenience function that parses the mo file ``mofile`` and returns a
    :class:`~polib.MOFile` instance.

    Arguments:

    ``mofile``
        string, full or relative path to the mo file or its content (string
        or bytes).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext to generate the po file that was used to format the mo file
        (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.MOFile`
        instance).
    """
    return _pofile_or_mofile(mofile, 'mofile', **kwargs)
177 # function detect_encoding() {{{
def detect_encoding(file, binary_mode=False):
    """
    Try to detect the encoding used by the ``file``. The ``file`` argument can
    be a PO or MO file path or a string containing the contents of the file.
    If the encoding cannot be detected, the function will return the value of
    ``default_encoding``.

    Arguments:

    ``file``
        string, full or relative path to the po/mo file or its content.

    ``binary_mode``
        boolean, set this to True if ``file`` is a mo file.
    """
    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
    rxt = re.compile(u(PATTERN))
    rxb = re.compile(b(PATTERN))

    def charset_exists(charset):
        """Check whether ``charset`` is valid or not."""
        try:
            codecs.lookup(charset)
        except LookupError:
            return False
        return True

    def charset_from(match):
        """Return the known charset captured by ``match``, else ``None``."""
        if not match:
            return None
        enc = match.group(1).strip()
        if not isinstance(enc, text_type):
            enc = enc.decode('utf-8')
        return enc if charset_exists(enc) else None

    if not _is_file(file):
        # ``file`` holds the contents: try the text pattern first and fall
        # back to the bytes pattern when the contents are bytes
        try:
            match = rxt.search(file)
        except TypeError:
            match = rxb.search(file)
        enc = charset_from(match)
        if enc is not None:
            return enc
    else:
        # For PY3, always treat as binary
        if binary_mode or PY3:
            mode, rx = 'rb', rxb
        else:
            mode, rx = 'r', rxt
        # the context manager closes the handle on every exit path,
        # including exceptions raised while reading
        with open(file, mode) as f:
            for line in f:
                enc = charset_from(rx.search(line))
                if enc is not None:
                    return enc
    return default_encoding
241 # function escape() {{{
def escape(st):
    """
    Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
    the given string ``st`` and returns it.
    """
    # the backslash must be handled first, otherwise the backslashes
    # introduced by the later replacements would be escaped a second time
    return (
        st.replace('\\', r'\\')
        .replace('\t', r'\t')
        .replace('\r', r'\r')
        .replace('\n', r'\n')
        .replace('"', r'\"')
    )
259 # function unescape() {{{
def unescape(st):
    """
    Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
    the given string ``st`` and returns it.
    """
    replacements = {'n': '\n', 't': '\t', 'r': '\r', '\\': '\\'}

    def unescape_repl(m):
        char = m.group(1)
        # anything not in the table is the escaped double quote, which
        # unescapes to itself
        return replacements.get(char, char)

    return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
284 # function natural_sort() {{{
def natural_sort(lst):
    """
    Sort naturally the given list.
    Credits: http://stackoverflow.com/a/4836734

    Returns a new sorted list in which runs of digits are compared as
    integers and all other text is compared case-insensitively.
    """

    def convert(text):
        return int(text) if text.isdigit() else text.lower()

    def alphanum_key(key):
        # the capturing group makes re.split keep the digit runs
        return [convert(c) for c in re.split('([0-9]+)', key)]

    return sorted(lst, key=alphanum_key)
303 # class _BaseFile {{{
306 class _BaseFile(list):
308 Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
309 classes. This class should **not** be instantiated directly.
312 def __init__(self
, *_args
, **kwargs
):
314 Constructor, accepts the following keyword arguments:
317 string, the path to the po or mo file, or its content as a string.
320 integer, the wrap width, only useful when the ``-w`` option was
321 passed to xgettext (optional, default: ``78``).
324 string, the encoding to use, defaults to ``default_encoding``
325 global variable (optional).
327 ``check_for_duplicates``
328 whether to check for duplicate entries when adding entries to the
329 file, (optional, default: ``False``).
332 # the opened file handle
333 pofile
= kwargs
.get('pofile', None)
334 if pofile
and _is_file(pofile
):
337 self
.fpath
= kwargs
.get('fpath')
338 # the width at which lines should be wrapped
339 self
.wrapwidth
= kwargs
.get('wrapwidth', 78)
341 self
.encoding
= kwargs
.get('encoding', default_encoding
)
342 # whether to check for duplicate entries or not
343 self
.check_for_duplicates
= kwargs
.get('check_for_duplicates', False)
346 # both po and mo files have metadata
348 self
.metadata_is_fuzzy
= 0
350 def __unicode__(self
):
352 Returns the unicode representation of the file.
355 entries
= [self
.metadata_as_entry()] + [e
for e
in self
if not e
.obsolete
]
356 for entry
in entries
:
357 ret
.append(entry
.__unicode
__(self
.wrapwidth
))
358 for entry
in self
.obsolete_entries():
359 ret
.append(entry
.__unicode
__(self
.wrapwidth
))
360 ret
= u('\n').join(ret
)
366 return self
.__unicode
__()
372 Returns the string representation of the file.
374 return compat
.ustr(self
).encode(self
.encoding
)
376 def __contains__(self
, entry
):
378 Overridden ``list`` method to implement the membership test (in and
380 The method considers that an entry is in the file if it finds an entry
381 that has the same msgid (the test is **case sensitive**) and the same
382 msgctxt (or none for both entries).
387 an instance of :class:`~polib._BaseEntry`.
389 return self
.find(entry
.msgid
, by
='msgid', msgctxt
=entry
.msgctxt
) is not None
391 def __eq__(self
, other
):
392 return str(self
) == str(other
)
395 return hash(str(self
))
397 def append(self
, entry
):
399 Overridden method to check for duplicates entries, if a user tries to
400 add an entry that is already in the file, the method will raise a
401 ``ValueError`` exception.
406 an instance of :class:`~polib._BaseEntry`.
408 # check_for_duplicates may not be defined (yet) when unpickling.
409 # But if pickling, we never want to check for duplicates anyway.
410 if getattr(self
, 'check_for_duplicates', False) and entry
in self
:
411 raise ValueError('Entry "%s" already exists' % entry
.msgid
)
412 super().append(entry
)
414 def insert(self
, index
, entry
):
416 Overridden method to check for duplicates entries, if a user tries to
417 add an entry that is already in the file, the method will raise a
418 ``ValueError`` exception.
423 index at which the entry should be inserted.
426 an instance of :class:`~polib._BaseEntry`.
428 if self
.check_for_duplicates
and entry
in self
:
429 raise ValueError('Entry "%s" already exists' % entry
.msgid
)
430 super().insert(index
, entry
)
432 def metadata_as_entry(self
):
434 Returns the file metadata as a :class:`~polib.POFile` instance.
436 e
= POEntry(msgid
='')
437 mdata
= self
.ordered_metadata()
440 for name
, value
in mdata
:
441 # Strip whitespace off each line in a multi-line entry
442 strs
.append(f
'{name}: {value}')
443 e
.msgstr
= '\n'.join(strs
) + '\n'
444 if self
.metadata_is_fuzzy
:
445 e
.flags
.append('fuzzy')
448 def save(self
, fpath
=None, repr_method
='__unicode__', newline
=None):
450 Saves the po file to ``fpath``.
451 If it is an existing file and no ``fpath`` is provided, then the
452 existing file is rewritten with the modified data.
457 string, full or relative path to the file.
460 string, the method to use for output.
463 string, controls how universal newlines works
465 if self
.fpath
is None and fpath
is None:
466 raise OSError('You must provide a file path to save() method')
467 contents
= getattr(self
, repr_method
)()
470 if repr_method
== 'to_binary':
471 fhandle
= open(fpath
, 'wb')
473 fhandle
= open(fpath
, 'w', encoding
=self
.encoding
, newline
=newline
)
474 if not isinstance(contents
, text_type
):
475 contents
= contents
.decode(self
.encoding
)
476 fhandle
.write(contents
)
478 # set the file path if not set
479 if self
.fpath
is None and fpath
:
482 def find(self
, st
, by
='msgid', include_obsolete_entries
=False, msgctxt
=False):
484 Find the entry which msgid (or property identified by the ``by``
485 argument) matches the string ``st``.
490 string, the string to search for.
493 string, the property to use for comparison (default: ``msgid``).
495 ``include_obsolete_entries``
496 boolean, whether to also search in entries that are obsolete.
499 string, allows specifying a specific message context for the
502 if include_obsolete_entries
:
505 entries
= [e
for e
in self
if not e
.obsolete
]
508 if getattr(e
, by
) == st
:
509 if msgctxt
is not False and e
.msgctxt
!= msgctxt
:
512 if len(matches
) == 1:
514 elif len(matches
) > 1:
516 # find the entry with no msgctx
523 # fallback to the first entry found
527 def ordered_metadata(self
):
529 Convenience method that returns an ordered version of the metadata
530 dictionary. The return value is list of tuples (metadata name,
533 # copy the dict first
534 metadata
= self
.metadata
.copy()
536 'Project-Id-Version',
537 'Report-Msgid-Bugs-To',
545 'Content-Transfer-Encoding',
549 for data
in data_order
:
551 value
= metadata
.pop(data
)
552 ordered_data
.append((data
, value
))
555 # the rest of the metadata will be alphabetically ordered since there
556 # are no specs for this AFAIK
557 for data
in natural_sort(metadata
.keys()):
558 value
= metadata
[data
]
559 ordered_data
.append((data
, value
))
564 Return the binary representation of the file.
567 entries
= self
.translated_entries()
569 # the keys are sorted in the .mo file
570 def cmp(_self
, other
):
571 # msgfmt compares entries with msgctxt if it exists
572 self_msgid
= _self
.msgctxt
or _self
.msgid
573 other_msgid
= other
.msgctxt
or other
.msgid
574 if self_msgid
> other_msgid
:
576 elif self_msgid
< other_msgid
:
582 entries
.sort(key
=lambda o
: o
.msgid_with_context
.encode('utf-8'))
583 mentry
= self
.metadata_as_entry()
584 entries
= [mentry
] + entries
585 entries_len
= len(entries
)
586 ids
, strs
= b(''), b('')
588 # For each string, we need size and file offset. Each string is
589 # NUL terminated; the NUL does not count into the size.
592 # Contexts are stored by storing the concatenation of the
593 # context, a <EOT> byte, and the original string
594 msgid
= self
._encode
(e
.msgctxt
+ '\4')
597 for index
in sorted(e
.msgstr_plural
.keys()):
598 msgstr
.append(e
.msgstr_plural
[index
])
599 msgid
+= self
._encode
(e
.msgid
+ '\0' + e
.msgid_plural
)
600 msgstr
= self
._encode
('\0'.join(msgstr
))
602 msgid
+= self
._encode
(e
.msgid
)
603 msgstr
= self
._encode
(e
.msgstr
)
604 offsets
.append((len(ids
), len(msgid
), len(strs
), len(msgstr
)))
605 ids
+= msgid
+ b('\0')
606 strs
+= msgstr
+ b('\0')
608 # The header is 7 32-bit unsigned integers.
609 keystart
= 7 * 4 + 16 * entries_len
610 # and the values start after the keys
611 valuestart
= keystart
+ len(ids
)
614 # The string table first has the list of keys, then the list of values.
615 # Each entry has first the size of the string, then the file offset.
616 for o1
, l1
, o2
, l2
in offsets
:
617 koffsets
+= [l1
, o1
+ keystart
]
618 voffsets
+= [l2
, o2
+ valuestart
]
619 offsets
= koffsets
+ voffsets
621 output
= struct
.pack(
631 # start of value index
632 7 * 4 + entries_len
* 8,
633 # size and offset of hash table, we don't use hash tables
637 if PY3
and sys
.version_info
.minor
> 1: # python 3.2 or newer
638 output
+= array
.array('i', offsets
).tobytes()
640 output
+= array
.array('i', offsets
).tostring()
645 def _encode(self
, mixed
):
647 Encodes the given ``mixed`` argument with the file encoding if and
648 only if it's an unicode string and returns the encoded string.
650 if isinstance(mixed
, text_type
):
651 mixed
= mixed
.encode(self
.encoding
)
659 class POFile(_BaseFile
):
661 Po (or Pot) file reader/writer.
662 This class inherits the :class:`~polib._BaseFile` class and, by extension,
663 the python ``list`` type.
666 def __unicode__(self
):
668 Returns the unicode representation of the po file.
670 ret
, headers
= '', self
.header
.split('\n')
671 for header
in headers
:
674 elif header
[:1] in [',', ':']:
675 ret
+= '#%s\n' % header
677 ret
+= '# %s\n' % header
679 if not isinstance(ret
, text_type
):
680 ret
= ret
.decode(self
.encoding
)
682 return ret
+ _BaseFile
.__unicode
__(self
)
684 def save_as_mofile(self
, fpath
):
686 Saves the binary representation of the file to given ``fpath``.
691 string, full or relative path to the mo file.
693 _BaseFile
.save(self
, fpath
, 'to_binary')
695 def percent_translated(self
):
697 Convenience method that returns the percentage of translated
700 total
= len([e
for e
in self
if not e
.obsolete
])
703 translated
= len(self
.translated_entries())
704 return int(translated
* 100 / float(total
))
def translated_entries(self):
    """
    Convenience method that returns the list of translated entries.

    An entry is kept when its ``translated()`` method returns a true value.
    """
    return [e for e in self if e.translated()]
def untranslated_entries(self):
    """
    Convenience method that returns the list of untranslated entries.

    Obsolete and fuzzy entries are excluded from the result.
    """
    return [
        e for e in self if not e.translated() and not e.obsolete and not e.fuzzy
    ]
def fuzzy_entries(self):
    """
    Convenience method that returns the list of fuzzy entries.

    Obsolete entries are excluded even when they are fuzzy.
    """
    return [e for e in self if e.fuzzy and not e.obsolete]
def obsolete_entries(self):
    """
    Convenience method that returns the list of obsolete entries.
    """
    return [e for e in self if e.obsolete]
732 def merge(self
, refpot
):
734 Convenience method that merges the current pofile with the pot file
735 provided. It behaves exactly as the gettext msgmerge utility:
737 * comments of this file will be preserved, but extracted comments and
738 occurrences will be discarded;
739 * any translations or comments in the file will be discarded, however,
740 dot comments and file positions will be preserved;
741 * the fuzzy flags are preserved.
746 object POFile, the reference catalog.
748 # Store entries in dict/set for faster access
749 self_entries
= {entry
.msgid_with_context
: entry
for entry
in self
}
750 refpot_msgids
= {entry
.msgid_with_context
for entry
in refpot
}
751 # Merge entries that are in the refpot
753 e
= self_entries
.get(entry
.msgid_with_context
)
758 # ok, now we must "obsolete" entries that are not in the refpot anymore
760 if entry
.msgid_with_context
not in refpot_msgids
:
761 entry
.obsolete
= True
768 class MOFile(_BaseFile
):
770 Mo file reader/writer.
771 This class inherits the :class:`~polib._BaseFile` class and, by
772 extension, the python ``list`` type.
776 MAGIC_SWAPPED
= 0xDE120495
778 def __init__(self
, *args
, **kwargs
):
780 Constructor, accepts all keywords arguments accepted by
781 :class:`~polib._BaseFile` class.
783 _BaseFile
.__init
__(self
, *args
, **kwargs
)
784 self
.magic_number
= None
787 def save_as_pofile(self
, fpath
):
789 Saves the mofile as a pofile to ``fpath``.
794 string, full or relative path to the file.
796 _BaseFile
.save(self
, fpath
)
798 def save(self
, fpath
=None):
800 Saves the mofile to ``fpath``.
805 string, full or relative path to the file.
807 _BaseFile
.save(self
, fpath
, 'to_binary')
809 def percent_translated(self
):
811 Convenience method to keep the same interface with POFile instances.
815 def translated_entries(self
):
817 Convenience method to keep the same interface with POFile instances.
821 def untranslated_entries(self
):
823 Convenience method to keep the same interface with POFile instances.
827 def fuzzy_entries(self
):
829 Convenience method to keep the same interface with POFile instances.
833 def obsolete_entries(self
):
835 Convenience method to keep the same interface with POFile instances.
841 # class _BaseEntry {{{
846 Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
847 This class should **not** be instantiated directly.
850 def __init__(self
, *_args
, **kwargs
):
852 Constructor, accepts the following keyword arguments:
855 string, the entry msgid.
858 string, the entry msgstr.
861 string, the entry msgid_plural.
864 dict, the entry msgstr_plural lines.
867 string, the entry context (msgctxt).
870 bool, whether the entry is "obsolete" or not.
873 string, the encoding to use, defaults to ``default_encoding``
874 global variable (optional).
876 self
.msgid
= kwargs
.get('msgid', '')
877 self
.msgstr
= kwargs
.get('msgstr', '')
878 self
.msgid_plural
= kwargs
.get('msgid_plural', '')
879 self
.msgstr_plural
= kwargs
.get('msgstr_plural', {})
880 self
.msgctxt
= kwargs
.get('msgctxt', None)
881 self
.obsolete
= kwargs
.get('obsolete', False)
882 self
.encoding
= kwargs
.get('encoding', default_encoding
)
884 def __unicode__(self
, wrapwidth
=78):
886 Returns the unicode representation of the entry.
893 # write the msgctxt if any
894 if self
.msgctxt
is not None:
895 ret
+= self
._str
_field
('msgctxt', delflag
, '', self
.msgctxt
, wrapwidth
)
897 ret
+= self
._str
_field
('msgid', delflag
, '', self
.msgid
, wrapwidth
)
898 # write the msgid_plural if any
899 if self
.msgid_plural
:
900 ret
+= self
._str
_field
(
901 'msgid_plural', delflag
, '', self
.msgid_plural
, wrapwidth
903 if self
.msgstr_plural
:
904 # write the msgstr_plural if any
905 msgstrs
= self
.msgstr_plural
909 msgstr
= msgstrs
[index
]
910 plural_index
= '[%s]' % index
911 ret
+= self
._str
_field
(
912 'msgstr', delflag
, plural_index
, msgstr
, wrapwidth
915 # otherwise write the msgstr
916 ret
+= self
._str
_field
('msgstr', delflag
, '', self
.msgstr
, wrapwidth
)
918 ret
= u('\n').join(ret
)
924 return self
.__unicode
__()
930 Returns the string representation of the entry.
932 return compat
.ustr(self
).encode(self
.encoding
)
934 def __eq__(self
, other
):
935 return str(self
) == str(other
)
938 return hash(str(self
))
940 def _str_field(self
, fieldname
, delflag
, plural_index
, field
, wrapwidth
=78):
941 lines
= field
.splitlines(True)
943 lines
= [''] + lines
# start with initial empty line
945 escaped_field
= escape(field
)
946 specialchars_count
= 0
947 for c
in ['\\', '\n', '\r', '\t', '"']:
948 specialchars_count
+= field
.count(c
)
949 # comparison must take into account fieldname length + one space
950 # + 2 quotes (eg. msgid "<string>")
951 flength
= len(fieldname
) + 3
953 flength
+= len(plural_index
)
954 real_wrapwidth
= wrapwidth
- flength
+ specialchars_count
955 if wrapwidth
> 0 and len(field
) > real_wrapwidth
:
956 # Wrap the line but take field name into account
959 for item
in textwrap
.wrap(
961 wrapwidth
- 2, # 2 for quotes ""
962 drop_whitespace
=False,
963 break_long_words
=False,
968 if fieldname
.startswith('previous_'):
969 # quick and dirty trick to get the real field name
970 fieldname
= fieldname
[9:]
972 ret
= [f
'{delflag}{fieldname}{plural_index} "{escape(lines.pop(0))}"']
974 ret
.append(f
'{delflag}"{escape(line)}"')
978 def msgid_with_context(self
):
980 return '{}{}{}'.format(self
.msgctxt
, '\x04', self
.msgid
)
988 class POEntry(_BaseEntry
):
990 Represents a po file entry.
993 def __init__(self
, *args
, **kwargs
):
995 Constructor, accepts the following keyword arguments:
998 string, the entry comment.
1001 string, the entry translator comment.
1004 list, the entry occurrences.
1007 list, the entry flags.
1009 ``previous_msgctxt``
1010 string, the entry previous context.
1013 string, the entry previous msgid.
1015 ``previous_msgid_plural``
1016 string, the entry previous msgid_plural.
1019 integer, the line number of the entry
1021 _BaseEntry
.__init
__(self
, *args
, **kwargs
)
1022 self
.comment
= kwargs
.get('comment', '')
1023 self
.tcomment
= kwargs
.get('tcomment', '')
1024 self
.occurrences
= kwargs
.get('occurrences', [])
1025 self
.flags
= kwargs
.get('flags', [])
1026 self
.previous_msgctxt
= kwargs
.get('previous_msgctxt', None)
1027 self
.previous_msgid
= kwargs
.get('previous_msgid', None)
1028 self
.previous_msgid_plural
= kwargs
.get('previous_msgid_plural', None)
1029 self
.linenum
= kwargs
.get('linenum', None)
1031 def __unicode__(self
, wrapwidth
=78):
1033 Returns the unicode representation of the entry.
1036 # comments first, if any (with text wrapping as xgettext does)
1038 comments
= [('tcomment', '# ')]
1040 comments
= [('comment', '#. '), ('tcomment', '# ')]
1042 val
= getattr(self
, c
[0])
1044 for comment
in val
.split('\n'):
1045 if len(comment
) + len(c
[1]) > wrapwidth
> 0:
1046 ret
+= textwrap
.wrap(
1049 initial_indent
=c
[1],
1050 subsequent_indent
=c
[1],
1051 break_long_words
=False,
1054 ret
.append(f
'{c[1]}{comment}')
1056 # occurrences (with text wrapping as xgettext does)
1057 if not self
.obsolete
and self
.occurrences
:
1059 for fpath
, lineno
in self
.occurrences
:
1061 filelist
.append(f
'{fpath}:{lineno}')
1063 filelist
.append(fpath
)
1064 filestr
= ' '.join(filelist
)
1065 if len(filestr
) + 3 > wrapwidth
> 0:
1066 # textwrap split words that contain hyphen, this is not
1067 # what we want for filenames, so the dirty hack is to
1068 # temporally replace hyphens with a char that a file cannot
1071 line
.replace('*', '-')
1072 for line
in textwrap
.wrap(
1073 filestr
.replace('-', '*'),
1075 initial_indent
='#: ',
1076 subsequent_indent
='#: ',
1077 break_long_words
=False,
1081 ret
.append('#: ' + filestr
)
1083 # flags (TODO: wrapping ?)
1085 ret
.append('#, %s' % ', '.join(self
.flags
))
1087 # previous context and previous msgid/msgid_plural
1088 fields
= ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural']
1094 val
= getattr(self
, f
)
1096 ret
+= self
._str
_field
(f
, prefix
, '', val
, wrapwidth
)
1098 ret
.append(_BaseEntry
.__unicode
__(self
, wrapwidth
))
1099 ret
= u('\n').join(ret
)
1102 def __cmp__(self
, other
):
1104 Called by comparison operations if rich comparison is not defined.
1106 # First: Obsolete test
1107 if self
.obsolete
!= other
.obsolete
:
1112 # Work on a copy to protect original
1113 occ1
= sorted(self
.occurrences
[:])
1114 occ2
= sorted(other
.occurrences
[:])
1120 msgctxt
= self
.msgctxt
or '0'
1121 othermsgctxt
= other
.msgctxt
or '0'
1122 if msgctxt
> othermsgctxt
:
1124 elif msgctxt
< othermsgctxt
:
1126 # Compare msgid_plural
1127 msgid_plural
= self
.msgid_plural
or '0'
1128 othermsgid_plural
= other
.msgid_plural
or '0'
1129 if msgid_plural
> othermsgid_plural
:
1131 elif msgid_plural
< othermsgid_plural
:
1133 # Compare msgstr_plural
1134 if self
.msgstr_plural
and isinstance(self
.msgstr_plural
, dict):
1135 msgstr_plural
= list(self
.msgstr_plural
.values())
1138 if other
.msgstr_plural
and isinstance(other
.msgstr_plural
, dict):
1139 othermsgstr_plural
= list(other
.msgstr_plural
.values())
1141 othermsgstr_plural
= []
1142 if msgstr_plural
> othermsgstr_plural
:
1144 elif msgstr_plural
< othermsgstr_plural
:
1147 if self
.msgid
> other
.msgid
:
1149 elif self
.msgid
< other
.msgid
:
1152 if self
.msgstr
> other
.msgstr
:
1154 elif self
.msgstr
< other
.msgstr
:
1158 def __gt__(self
, other
):
1159 return self
.__cmp
__(other
) > 0
1161 def __lt__(self
, other
):
1162 return self
.__cmp
__(other
) < 0
1164 def __ge__(self
, other
):
1165 return self
.__cmp
__(other
) >= 0
1167 def __le__(self
, other
):
1168 return self
.__cmp
__(other
) <= 0
1170 def __eq__(self
, other
):
1171 return self
.__cmp
__(other
) == 0
1173 def __ne__(self
, other
):
1174 return self
.__cmp
__(other
) != 0
1176 def translated(self
):
1178 Returns ``True`` if the entry has been translated or ``False``
1181 if self
.obsolete
or self
.fuzzy
:
1183 if self
.msgstr
!= '':
1185 if self
.msgstr_plural
:
1186 for pos
in self
.msgstr_plural
:
1187 if self
.msgstr_plural
[pos
] == '':
1192 def merge(self
, other
):
1194 Merge the current entry with the given pot entry.
1196 self
.msgid
= other
.msgid
1197 self
.msgctxt
= other
.msgctxt
1198 self
.occurrences
= other
.occurrences
1199 self
.comment
= other
.comment
1201 self
.flags
= other
.flags
[:] # clone flags
1203 self
.flags
.append('fuzzy')
1204 self
.msgid_plural
= other
.msgid_plural
1205 self
.obsolete
= other
.obsolete
1206 self
.previous_msgctxt
= other
.previous_msgctxt
1207 self
.previous_msgid
= other
.previous_msgid
1208 self
.previous_msgid_plural
= other
.previous_msgid_plural
1209 if other
.msgstr_plural
:
1210 for pos
in other
.msgstr_plural
:
1212 # keep existing translation at pos if any
1213 self
.msgstr_plural
[pos
]
1215 self
.msgstr_plural
[pos
] = ''
1219 return 'fuzzy' in self
.flags
1222 return hash((self
.msgid
, self
.msgstr
))
1229 class MOEntry(_BaseEntry
):
1231 Represents a mo file entry.
1234 def __init__(self
, *args
, **kwargs
):
1236 Constructor, accepts the following keyword arguments,
1237 for consistency with :class:`~polib.POEntry`:
1243 ``previous_msgctxt``
1245 ``previous_msgid_plural``
1247 Note: even though these keyword arguments are accepted,
1248 they hold no real meaning in the context of MO files
1249 and are simply ignored.
1251 _BaseEntry
.__init
__(self
, *args
, **kwargs
)
1254 self
.occurrences
= []
1256 self
.previous_msgctxt
= None
1257 self
.previous_msgid
= None
1258 self
.previous_msgid_plural
= None
1261 return hash((self
.msgid
, self
.msgstr
))
1265 # class _POFileParser {{{
1268 class _POFileParser
:
1270 A finite state machine to parse efficiently and correctly po
1274 def __init__(self
, pofile
, *_args
, **kwargs
):
1281 string, path to the po file or its content
1284 string, the encoding to use, defaults to ``default_encoding``
1285 global variable (optional).
1287 ``check_for_duplicates``
1288 whether to check for duplicate entries when adding entries to the
1289 file (optional, default: ``False``).
1291 enc
= kwargs
.get('encoding', default_encoding
)
1292 if _is_file(pofile
):
1294 self
.fhandle
= open(pofile
, encoding
=enc
)
1296 enc
= default_encoding
1297 self
.fhandle
= open(pofile
, encoding
=enc
)
1299 self
.fhandle
= pofile
.splitlines()
1301 klass
= kwargs
.get('klass')
1304 self
.instance
= klass(
1307 check_for_duplicates
=kwargs
.get('check_for_duplicates', False),
1309 self
.transitions
= {}
1310 self
.current_line
= 0
1311 self
.current_entry
= POEntry(linenum
=self
.current_line
)
1312 self
.current_state
= 'st'
1313 self
.current_token
= None
1314 # two memo flags used in handlers
1315 self
.msgstr_index
= 0
1316 self
.entry_obsolete
= 0
1317 # Configure the state machine, by adding transitions.
1318 # Signification of symbols:
1319 # * ST: Beginning of the file (start)
1321 # * TC: a translation comment
1322 # * GC: a generated comment
1323 # * OC: a file/line occurrence
1324 # * FL: a flags line
1325 # * CT: a message context
1326 # * PC: a previous msgctxt
1327 # * PM: a previous msgid
1328 # * PP: a previous msgid_plural
1330 # * MP: a msgid plural
1332 # * MX: a msgstr plural
1333 # * MC: a msgid or msgstr continuation line
1351 self
.add('tc', ['st', 'he'], 'he')
1354 ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mp', 'mx', 'mi'],
1357 self
.add('gc', all
, 'gc')
1358 self
.add('oc', all
, 'oc')
1359 self
.add('fl', all
, 'fl')
1360 self
.add('pc', all
, 'pc')
1361 self
.add('pm', all
, 'pm')
1362 self
.add('pp', all
, 'pp')
1365 ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
1370 ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
1373 self
.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
1374 self
.add('ms', ['mi', 'mp', 'tc'], 'ms')
1375 self
.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
1376 self
.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')
1380 Run the state machine, parse the file line by line and call process()
1381 with the current matched symbol.
1388 'msgid_plural': 'mp',
1391 'msgid_plural': 'pp',
1396 fpath
= '%s ' % self
.instance
.fpath
if self
.instance
.fpath
else ''
1397 for line
in self
.fhandle
:
1398 self
.current_line
+= 1
1399 if self
.current_line
== 1:
1400 BOM
= codecs
.BOM_UTF8
.decode('utf-8')
1401 if line
.startswith(BOM
):
1402 line
= line
[len(BOM
) :]
1407 tokens
= line
.split(None, 2)
1408 nb_tokens
= len(tokens
)
1410 if tokens
[0] == '#~|':
1413 if tokens
[0] == '#~' and nb_tokens
> 1:
1414 line
= line
[3:].strip()
1417 self
.entry_obsolete
= 1
1419 self
.entry_obsolete
= 0
1421 # Take care of keywords like
1422 # msgid, msgid_plural, msgctxt & msgstr.
1423 if tokens
[0] in keywords
and nb_tokens
> 1:
1424 line
= line
[len(tokens
[0]) :].lstrip()
1425 if re
.search(r
'([^\\]|^)"', line
[1:-1]):
1427 'Syntax error in po file %s(line %s): '
1428 'unescaped double quote found' % (fpath
, self
.current_line
)
1430 self
.current_token
= line
1431 self
.process(keywords
[tokens
[0]])
1434 self
.current_token
= line
1436 if tokens
[0] == '#:':
1439 # we are on a occurrences line
1442 elif line
[:1] == '"':
1443 # we are on a continuation line
1444 if re
.search(r
'([^\\]|^)"', line
[1:-1]):
1446 'Syntax error in po file %s(line %s): '
1447 'unescaped double quote found' % (fpath
, self
.current_line
)
1451 elif line
[:7] == 'msgstr[':
1452 # we are on a msgstr plural
1455 elif tokens
[0] == '#,':
1458 # we are on a flags line
1461 elif tokens
[0] == '#' or tokens
[0].startswith('##'):
1464 # we are on a translator comment line
1467 elif tokens
[0] == '#.':
1470 # we are on a generated comment line
1473 elif tokens
[0] == '#|':
1476 'Syntax error in po file %s(line %s)'
1477 % (fpath
, self
.current_line
)
1480 # Remove the marker and any whitespace right after that.
1481 line
= line
[2:].lstrip()
1482 self
.current_token
= line
1484 if tokens
[1].startswith('"'):
1485 # Continuation of previous metadata.
1490 # Invalid continuation line.
1492 'Syntax error in po file %s(line %s): '
1493 'invalid continuation line' % (fpath
, self
.current_line
)
1496 # we are on a "previous translation" comment line,
1497 if tokens
[1] not in prev_keywords
:
1498 # Unknown keyword in previous translation comment.
1500 'Syntax error in po file %s(line %s): '
1501 'unknown keyword %s' % (fpath
, self
.current_line
, tokens
[1])
1504 # Remove the keyword and any whitespace
1505 # between it and the starting quote.
1506 line
= line
[len(tokens
[1]) :].lstrip()
1507 self
.current_token
= line
1508 self
.process(prev_keywords
[tokens
[1]])
1512 f
'Syntax error in po file {fpath}(line {self.current_line})'
1515 if self
.current_entry
and len(tokens
) > 0 and not tokens
[0].startswith('#'):
1516 # since entries are added when another entry is found, we must add
1517 # the last entry here (only if there are lines). Trailing comments
1519 self
.instance
.append(self
.current_entry
)
1521 # before returning the instance, check if there's metadata and if
1522 # so extract it in a dict
1523 metadataentry
= self
.instance
.find('')
1524 if metadataentry
: # metadata found
1526 self
.instance
.remove(metadataentry
)
1527 self
.instance
.metadata_is_fuzzy
= metadataentry
.flags
1529 for msg
in metadataentry
.msgstr
.splitlines():
1531 key
, val
= msg
.split(':', 1)
1532 self
.instance
.metadata
[key
] = val
.strip()
1533 except (ValueError, KeyError):
1535 self
.instance
.metadata
[key
] += '\n' + msg
.strip()
1537 if not isinstance(self
.fhandle
, list): # must be file
1538 self
.fhandle
.close()
1539 return self
.instance
1541 def add(self
, symbol
, states
, next_state
):
1543 Add a transition to the state machine.
1548 string, the matched token (two chars symbol).
1551 list, a list of states (two chars symbols).
1554 the next state the fsm will have after the action.
1556 for state
in states
:
1557 action
= getattr(self
, 'handle_%s' % next_state
)
1558 self
.transitions
[(symbol
, state
)] = (action
, next_state
)
1560 def process(self
, symbol
):
1562 Process the transition corresponding to the current state and the
1568 string, the matched token (two chars symbol).
1571 integer, the current line number of the parsed file.
1574 (action
, state
) = self
.transitions
[(symbol
, self
.current_state
)]
1576 self
.current_state
= state
1578 fpath
= '%s ' % self
.instance
.fpath
if self
.instance
.fpath
else ''
1579 if hasattr(self
.fhandle
, 'close'):
1580 self
.fhandle
.close()
1581 raise OSError(f
'Syntax error in po file {fpath}(line {self.current_line})')
1585 def handle_he(self
):
1586 """Handle a header comment."""
1587 if self
.instance
.header
!= '':
1588 self
.instance
.header
+= '\n'
1589 self
.instance
.header
+= self
.current_token
[2:]
1592 def handle_tc(self
):
1593 """Handle a translator comment."""
1594 if self
.current_state
in ['mc', 'ms', 'mx']:
1595 self
.instance
.append(self
.current_entry
)
1596 self
.current_entry
= POEntry(linenum
=self
.current_line
)
1597 if self
.current_entry
.tcomment
!= '':
1598 self
.current_entry
.tcomment
+= '\n'
1599 tcomment
= self
.current_token
.lstrip('#')
1600 if tcomment
.startswith(' '):
1601 tcomment
= tcomment
[1:]
1602 self
.current_entry
.tcomment
+= tcomment
1605 def handle_gc(self
):
1606 """Handle a generated comment."""
1607 if self
.current_state
in ['mc', 'ms', 'mx']:
1608 self
.instance
.append(self
.current_entry
)
1609 self
.current_entry
= POEntry(linenum
=self
.current_line
)
1610 if self
.current_entry
.comment
!= '':
1611 self
.current_entry
.comment
+= '\n'
1612 self
.current_entry
.comment
+= self
.current_token
[3:]
1615 def handle_oc(self
):
1616 """Handle a file:num occurrence."""
1617 if self
.current_state
in ['mc', 'ms', 'mx']:
1618 self
.instance
.append(self
.current_entry
)
1619 self
.current_entry
= POEntry(linenum
=self
.current_line
)
1620 occurrences
= self
.current_token
[3:].split()
1621 for occurrence
in occurrences
:
1622 if occurrence
!= '':
1624 fil
, line
= occurrence
.rsplit(':', 1)
1625 if not line
.isdigit():
1628 self
.current_entry
.occurrences
.append((fil
, line
))
1629 except (ValueError, AttributeError):
1630 self
.current_entry
.occurrences
.append((occurrence
, ''))
1633 def handle_fl(self
):
1634 """Handle a flags line."""
1635 if self
.current_state
in ['mc', 'ms', 'mx']:
1636 self
.instance
.append(self
.current_entry
)
1637 self
.current_entry
= POEntry(linenum
=self
.current_line
)
1638 self
.current_entry
.flags
+= [
1639 c
.strip() for c
in self
.current_token
[3:].split(',')
1643 def handle_pp(self
):
1644 """Handle a previous msgid_plural line."""
1645 if self
.current_state
in ['mc', 'ms', 'mx']:
1646 self
.instance
.append(self
.current_entry
)
1647 self
.current_entry
= POEntry(linenum
=self
.current_line
)
1648 self
.current_entry
.previous_msgid_plural
= unescape(self
.current_token
[1:-1])
1651 def handle_pm(self
):
1652 """Handle a previous msgid line."""
1653 if self
.current_state
in ['mc', 'ms', 'mx']:
1654 self
.instance
.append(self
.current_entry
)
1655 self
.current_entry
= POEntry(linenum
=self
.current_line
)
1656 self
.current_entry
.previous_msgid
= unescape(self
.current_token
[1:-1])
1659 def handle_pc(self
):
1660 """Handle a previous msgctxt line."""
1661 if self
.current_state
in ['mc', 'ms', 'mx']:
1662 self
.instance
.append(self
.current_entry
)
1663 self
.current_entry
= POEntry(linenum
=self
.current_line
)
1664 self
.current_entry
.previous_msgctxt
= unescape(self
.current_token
[1:-1])
1667 def handle_ct(self
):
1668 """Handle a msgctxt."""
1669 if self
.current_state
in ['mc', 'ms', 'mx']:
1670 self
.instance
.append(self
.current_entry
)
1671 self
.current_entry
= POEntry(linenum
=self
.current_line
)
1672 self
.current_entry
.msgctxt
= unescape(self
.current_token
[1:-1])
1675 def handle_mi(self
):
1676 """Handle a msgid."""
1677 if self
.current_state
in ['mc', 'ms', 'mx']:
1678 self
.instance
.append(self
.current_entry
)
1679 self
.current_entry
= POEntry(linenum
=self
.current_line
)
1680 self
.current_entry
.obsolete
= self
.entry_obsolete
1681 self
.current_entry
.msgid
= unescape(self
.current_token
[1:-1])
1684 def handle_mp(self
):
1685 """Handle a msgid plural."""
1686 self
.current_entry
.msgid_plural
= unescape(self
.current_token
[1:-1])
1689 def handle_ms(self
):
1690 """Handle a msgstr."""
1691 self
.current_entry
.msgstr
= unescape(self
.current_token
[1:-1])
1694 def handle_mx(self
):
1695 """Handle a msgstr plural."""
1696 index
= self
.current_token
[7]
1697 value
= self
.current_token
[self
.current_token
.find('"') + 1 : -1]
1698 self
.current_entry
.msgstr_plural
[int(index
)] = unescape(value
)
1699 self
.msgstr_index
= int(index
)
1702 def handle_mc(self
):
1703 """Handle a msgid or msgstr continuation line."""
1704 token
= unescape(self
.current_token
[1:-1])
1705 if self
.current_state
== 'ct':
1706 self
.current_entry
.msgctxt
+= token
1707 elif self
.current_state
== 'mi':
1708 self
.current_entry
.msgid
+= token
1709 elif self
.current_state
== 'mp':
1710 self
.current_entry
.msgid_plural
+= token
1711 elif self
.current_state
== 'ms':
1712 self
.current_entry
.msgstr
+= token
1713 elif self
.current_state
== 'mx':
1714 self
.current_entry
.msgstr_plural
[self
.msgstr_index
] += token
1715 elif self
.current_state
== 'pp':
1716 self
.current_entry
.previous_msgid_plural
+= token
1717 elif self
.current_state
== 'pm':
1718 self
.current_entry
.previous_msgid
+= token
1719 elif self
.current_state
== 'pc':
1720 self
.current_entry
.previous_msgctxt
+= token
1721 # don't change the current state
1726 # class _MOFileParser {{{
1729 class _MOFileParser
:
1731 A class to parse binary mo files.
1734 def __init__(self
, mofile
, *_args
, **kwargs
):
1741 string, path to the mo file or its content
1744 string, the encoding to use, defaults to ``default_encoding``
1745 global variable (optional).
1747 ``check_for_duplicates``
1748 whether to check for duplicate entries when adding entries to the
1749 file (optional, default: ``False``).
1751 if _is_file(mofile
):
1752 self
.fhandle
= open(mofile
, 'rb')
1754 self
.fhandle
= io
.BytesIO(mofile
)
1756 klass
= kwargs
.get('klass')
1759 self
.instance
= klass(
1761 encoding
=kwargs
.get('encoding', default_encoding
),
1762 check_for_duplicates
=kwargs
.get('check_for_duplicates', False),
1767 Make sure the file is closed, this prevents warnings on unclosed file
1768 when running tests with python >= 3.2.
1770 if self
.fhandle
and hasattr(self
.fhandle
, 'close'):
1771 self
.fhandle
.close()
1775 Build the instance with the file handle provided in the
1778 # parse magic number
1779 magic_number
= self
._readbinary
('<I', 4)
1780 if magic_number
== MOFile
.MAGIC
:
1782 elif magic_number
== MOFile
.MAGIC_SWAPPED
:
1785 raise OSError('Invalid mo file, magic number is incorrect !')
1786 self
.instance
.magic_number
= magic_number
1787 # parse the version number and the number of strings
1788 version
, numofstrings
= self
._readbinary
(ii
, 8)
1789 # from MO file format specs: "A program seeing an unexpected major
1790 # revision number should stop reading the MO file entirely"
1791 if version
>> 16 not in (0, 1):
1792 raise OSError('Invalid mo file, unexpected major revision number')
1793 self
.instance
.version
= version
1794 # original strings and translation strings hash table offset
1795 msgids_hash_offset
, msgstrs_hash_offset
= self
._readbinary
(ii
, 8)
1796 # move to msgid hash table and read length and offset of msgids
1797 self
.fhandle
.seek(msgids_hash_offset
)
1799 for i
in range(numofstrings
):
1800 msgids_index
.append(self
._readbinary
(ii
, 8))
1801 # move to msgstr hash table and read length and offset of msgstrs
1802 self
.fhandle
.seek(msgstrs_hash_offset
)
1804 for i
in range(numofstrings
):
1805 msgstrs_index
.append(self
._readbinary
(ii
, 8))
1807 encoding
= self
.instance
.encoding
1808 for i
in range(numofstrings
):
1809 self
.fhandle
.seek(msgids_index
[i
][1])
1810 msgid
= self
.fhandle
.read(msgids_index
[i
][0])
1812 self
.fhandle
.seek(msgstrs_index
[i
][1])
1813 msgstr
= self
.fhandle
.read(msgstrs_index
[i
][0])
1814 if i
== 0 and not msgid
: # metadata
1815 raw_metadata
, metadata
= msgstr
.split(b('\n')), {}
1816 for line
in raw_metadata
:
1817 tokens
= line
.split(b(':'), 1)
1818 if tokens
[0] != b(''):
1820 k
= tokens
[0].decode(encoding
)
1821 v
= tokens
[1].decode(encoding
)
1822 metadata
[k
] = v
.strip()
1825 self
.instance
.metadata
= metadata
1827 # test if we have a plural entry
1828 msgid_tokens
= msgid
.split(b('\0'))
1829 if len(msgid_tokens
) > 1:
1830 entry
= self
._build
_entry
(
1831 msgid
=msgid_tokens
[0],
1832 msgid_plural
=msgid_tokens
[1],
1833 msgstr_plural
=dict(enumerate(msgstr
.split(b('\x00')))),
1836 entry
= self
._build
_entry
(msgid
=msgid
, msgstr
=msgstr
)
1837 self
.instance
.append(entry
)
1839 self
.fhandle
.close()
1840 return self
.instance
1842 def _build_entry(self
, msgid
, msgstr
=None, msgid_plural
=None, msgstr_plural
=None):
1843 msgctxt_msgid
= msgid
.split(b('\x04'))
1844 encoding
= self
.instance
.encoding
1845 if len(msgctxt_msgid
) > 1:
1847 'msgctxt': msgctxt_msgid
[0].decode(encoding
),
1848 'msgid': msgctxt_msgid
[1].decode(encoding
),
1851 kwargs
= {'msgid': msgid
.decode(encoding
)}
1853 kwargs
['msgstr'] = msgstr
.decode(encoding
)
1855 kwargs
['msgid_plural'] = msgid_plural
.decode(encoding
)
1857 for k
in msgstr_plural
:
1858 msgstr_plural
[k
] = msgstr_plural
[k
].decode(encoding
)
1859 kwargs
['msgstr_plural'] = msgstr_plural
1860 return MOEntry(**kwargs
)
1862 def _readbinary(self
, fmt
, numbytes
):
1864 Private method that unpack n bytes of data using format <fmt>.
1865 It returns a tuple or a mixed value if the tuple length is 1.
1867 content
= self
.fhandle
.read(numbytes
)
1868 tup
= struct
.unpack(fmt
, content
)