tree-wide: trivial code style tweaks
[git-cola.git] / cola / polib.py
blob99fef395935502f6f35715610f96d6e8292de274
1 # -* coding: utf-8 -*-
3 # License: MIT (see extras/polib/LICENSE file provided)
4 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
6 """
7 **polib** allows you to manipulate, create, modify gettext files (pot, po and
8 mo files). You can load existing files, iterate through its entries, add,
9 modify entries, comments or metadata, etc. or create new po files from scratch.
11 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
12 :func:`~polib.mofile` convenience functions.
13 """
14 from __future__ import absolute_import, division, print_function
15 import array
16 import codecs
17 import os
18 import re
19 import struct
20 import sys
21 import textwrap
22 import io
24 from . import compat
27 __author__ = 'David Jean Louis <izimobil@gmail.com>'
28 __version__ = '1.1.1'
29 __all__ = [
30 'pofile',
31 'POFile',
32 'POEntry',
33 'mofile',
34 'MOFile',
35 'MOEntry',
36 'default_encoding',
37 'escape',
38 'unescape',
39 'detect_encoding',
43 # the default encoding to use when encoding cannot be detected
44 default_encoding = 'utf-8'
46 # python 2/3 compatibility helpers {{{
# True when running under Python 3 or later.
PY3 = sys.version_info >= (3,)

if PY3:
    text_type = str

    def b(s):
        """Return the text string ``s`` encoded as UTF-8 bytes."""
        return s.encode("utf-8")

    def u(s):
        """Return ``s`` unchanged."""
        return s

else:
    text_type = compat.ustr

    def b(s):
        """Return ``s`` unchanged."""
        return s

    def u(s):
        """Return ``compat.ustr(s, "unicode_escape")``."""
        return compat.ustr(s, "unicode_escape")
70 # }}}
71 # _pofile_or_mofile {{{
def _pofile_or_mofile(f, filetype, **kwargs):
    """
    Shared implementation behind :func:`polib.pofile` and
    :func:`polib.mofile`.

    Arguments:

    ``f``
        the path or content handed to the parser.

    ``filetype``
        ``'pofile'`` selects ``_POFileParser``; any other value selects
        ``_MOFileParser``.

    ``**kwargs``
        ``encoding``, ``check_for_duplicates``, ``klass`` and ``wrapwidth``
        are read; see :func:`polib.pofile`.

    Returns the instance produced by the parser's ``parse()`` method, with
    its ``wrapwidth`` attribute set.
    """
    # use the caller's encoding, or detect it when none was given
    encoding = kwargs.get('encoding')
    if encoding is None:
        encoding = detect_encoding(f, filetype == 'mofile')

    # choose the parser matching the file type and run it
    if filetype == 'pofile':
        parser_class = _POFileParser
    else:
        parser_class = _MOFileParser
    parser = parser_class(
        f,
        encoding=encoding,
        check_for_duplicates=kwargs.get('check_for_duplicates', False),
        klass=kwargs.get('klass'),
    )
    instance = parser.parse()
    instance.wrapwidth = kwargs.get('wrapwidth', 78)
    return instance
97 # }}}
98 # _is_file {{{
def _is_file(filename_or_contents):
    """
    Safely returns the value of os.path.isfile(filename_or_contents).

    Arguments:

    ``filename_or_contents``
        either a filename, or a string holding the contents of some file.
        ``TypeError``, ``ValueError`` and ``UnicodeEncodeError`` raised by
        the check are swallowed and reported as ``False``.
    """
    try:
        is_file = os.path.isfile(filename_or_contents)
    except (TypeError, ValueError, UnicodeEncodeError):
        is_file = False
    return is_file
117 # }}}
118 # function pofile() {{{
# pylint: disable=redefined-outer-name
def pofile(pofile, **kwargs):
    """
    Convenience function that parses the po or pot file ``pofile`` and
    returns the resulting instance (a :class:`~polib.POFile` unless
    ``klass`` is given).

    Arguments:

    ``pofile``
        string, full or relative path to the po/pot file or its content
        (data).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was
        passed to xgettext (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a
        :class:`~polib.POFile` instance).
    """
    parsed = _pofile_or_mofile(pofile, 'pofile', **kwargs)
    return parsed
152 # }}}
153 # function mofile() {{{
# pylint: disable=redefined-outer-name
def mofile(mofile, **kwargs):
    """
    Convenience function that parses the mo file ``mofile`` and returns
    the resulting instance (a :class:`~polib.MOFile` unless ``klass`` is
    given).

    Arguments:

    ``mofile``
        string, full or relative path to the mo file or its content (string
        or bytes).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was
        passed to xgettext to generate the po file that was used to format
        the mo file (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a
        :class:`~polib.MOFile` instance).
    """
    parsed = _pofile_or_mofile(mofile, 'mofile', **kwargs)
    return parsed
189 # }}}
190 # function detect_encoding() {{{
def detect_encoding(file, binary_mode=False):
    """
    Try to detect the encoding used by the ``file``. The ``file`` argument can
    be a PO or MO file path or a string containing the contents of the file.
    If the encoding cannot be detected, the function will return the value of
    ``default_encoding``.

    Arguments:

    ``file``
        string, full or relative path to the po/mo file or its content.

    ``binary_mode``
        boolean, set this to True if ``file`` is a mo file.
    """
    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
    rxt = re.compile(u(PATTERN))
    rxb = re.compile(b(PATTERN))

    def charset_exists(charset):
        """Check whether ``charset`` is valid or not."""
        try:
            codecs.lookup(charset)
        except LookupError:
            return False
        return True

    def charset_from(match):
        """Return the charset captured by ``match`` if it is a known codec,
        otherwise ``None``."""
        enc = match.group(1).strip()
        if not isinstance(enc, text_type):
            enc = enc.decode('utf-8')
        if charset_exists(enc):
            return enc
        return None

    if not _is_file(file):
        # ``file`` holds the content itself: try the text pattern first and
        # fall back to the bytes pattern when the content is not text.
        try:
            match = rxt.search(file)
        except TypeError:
            match = rxb.search(file)
        if match:
            enc = charset_from(match)
            if enc is not None:
                return enc
    else:
        # For PY3, always treat as binary
        if binary_mode or PY3:
            mode, rx = 'rb', rxb
        else:
            mode, rx = 'r', rxt
        # The context manager closes the file on every exit path, including
        # when reading or matching raises.
        with open(file, mode) as f:
            for line in f:
                match = rx.search(line)
                if match:
                    enc = charset_from(match)
                    if enc is not None:
                        return enc
                    # unknown charset: keep looking at the following lines
    return default_encoding
253 # }}}
254 # function escape() {{{
def escape(st):
    """
    Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
    the given string ``st`` and returns it.
    """
    # The backslash comes first so that the backslashes introduced by the
    # later replacements are not escaped a second time.
    replacements = (
        ('\\', r'\\'),
        ('\t', r'\t'),
        ('\r', r'\r'),
        ('\n', r'\n'),
        ('\"', r'\"'),
    )
    escaped = st
    for char, sequence in replacements:
        escaped = escaped.replace(char, sequence)
    return escaped
271 # }}}
272 # function unescape() {{{
def unescape(st):
    """
    Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
    the given string ``st`` and returns it.
    """
    # character following the backslash -> character it stands for
    decoded = {
        'n': '\n',
        't': '\t',
        'r': '\r',
        '\\': '\\',
    }

    def unescape_repl(m):
        char = m.group(1)
        # anything not in the table (the double quote) stands for itself
        return decoded.get(char, char)

    return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
296 # }}}
297 # function natural_sort() {{{
def natural_sort(lst):
    """
    Sort naturally the given list.
    Credits: http://stackoverflow.com/a/4836734
    """

    def alphanum_key(key):
        # Split on runs of digits; digit chunks compare as integers and the
        # remaining chunks compare case-insensitively.
        parts = []
        for chunk in re.split('([0-9]+)', key):
            parts.append(int(chunk) if chunk.isdigit() else chunk.lower())
        return parts

    return sorted(lst, key=alphanum_key)
315 # }}}
316 # class _BaseFile {{{
class _BaseFile(list):
    """
    Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
    classes. This class should **not** be instantiated directly.
    """

    def __init__(self, *_args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``pofile``
            string, the path to the po or mo file, or its content as a string.

        ``wrapwidth``
            integer, the wrap width, only useful when the ``-w`` option was
            passed to xgettext (optional, default: ``78``).

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file, (optional, default: ``False``).
        """
        list.__init__(self)
        # the file path: ``pofile`` when it names an existing file,
        # otherwise the optional ``fpath`` keyword argument
        pofile = kwargs.get('pofile', None)  # pylint: disable=redefined-outer-name
        if pofile and _is_file(pofile):
            self.fpath = pofile
        else:
            self.fpath = kwargs.get('fpath')
        # the width at which lines should be wrapped
        self.wrapwidth = kwargs.get('wrapwidth', 78)
        # the file encoding
        self.encoding = kwargs.get('encoding', default_encoding)
        # whether to check for duplicate entries or not
        self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
        # header
        self.header = ''
        # both po and mo files have metadata
        self.metadata = {}
        self.metadata_is_fuzzy = 0

    def __unicode__(self):
        """
        Returns the unicode representation of the file: the metadata entry,
        then the non-obsolete entries, then the obsolete entries.
        """
        ret = []
        entries = [self.metadata_as_entry()] + [e for e in self if not e.obsolete]
        for entry in entries:
            ret.append(entry.__unicode__(self.wrapwidth))
        for entry in self.obsolete_entries():  # pylint: disable=no-member
            ret.append(entry.__unicode__(self.wrapwidth))
        ret = u('\n').join(ret)
        return ret

    if PY3:

        def __str__(self):
            return self.__unicode__()

    else:

        def __str__(self):
            """
            Returns the string representation of the file.
            """
            return compat.ustr(self).encode(self.encoding)

    def __contains__(self, entry):
        """
        Overridden ``list`` method to implement the membership test (in and
        not in).
        The method considers that an entry is in the file if it finds an entry
        that has the same msgid (the test is **case sensitive**) and the same
        msgctxt (or none for both entries).

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) is not None

    def __eq__(self, other):
        return str(self) == str(other)

    def __hash__(self):
        return hash(str(self))

    def append(self, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        # check_for_duplicates may not be defined (yet) when unpickling.
        # But if pickling, we never want to check for duplicates anyway.
        if getattr(self, 'check_for_duplicates', False) and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super(_BaseFile, self).append(entry)

    def insert(self, index, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Arguments:

        ``index``
            index at which the entry should be inserted.

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        # Same guard as append(): tolerate a missing check_for_duplicates
        # attribute instead of raising AttributeError.
        if getattr(self, 'check_for_duplicates', False) and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super(_BaseFile, self).insert(index, entry)

    def metadata_as_entry(self):
        """
        Returns the file metadata as a :class:`~polib.POEntry` instance with
        an empty msgid and one ``name: value`` line per metadata item in its
        msgstr.
        """
        e = POEntry(msgid='')
        mdata = self.ordered_metadata()
        if mdata:
            strs = ['%s: %s' % (name, value) for name, value in mdata]
            e.msgstr = '\n'.join(strs) + '\n'
        if self.metadata_is_fuzzy:
            e.flags.append('fuzzy')
        return e

    def save(self, fpath=None, repr_method='__unicode__', newline=None):
        """
        Saves the po file to ``fpath``.
        If it is an existing file and no ``fpath`` is provided, then the
        existing file is rewritten with the modified data.

        Keyword arguments:

        ``fpath``
            string, full or relative path to the file.

        ``repr_method``
            string, the method to use for output.

        ``newline``
            string, controls how universal newlines works

        Raises ``IOError`` when neither ``fpath`` nor ``self.fpath`` is set.
        """
        if self.fpath is None and fpath is None:
            raise IOError('You must provide a file path to save() method')
        contents = getattr(self, repr_method)()
        if fpath is None:
            fpath = self.fpath
        if repr_method == 'to_binary':
            with open(fpath, 'wb') as fhandle:
                fhandle.write(contents)
        else:
            # Decode before opening so that a decoding error does not leave
            # a truncated file behind.
            if not isinstance(contents, text_type):
                contents = contents.decode(self.encoding)
            with io.open(
                fpath, 'w', encoding=self.encoding, newline=newline
            ) as fhandle:
                fhandle.write(contents)
        # set the file path if not set
        if self.fpath is None and fpath:
            self.fpath = fpath

    def find(self, st, by='msgid', include_obsolete_entries=False, msgctxt=False):
        """
        Find the entry which msgid (or property identified by the ``by``
        argument) matches the string ``st``.

        Keyword arguments:

        ``st``
            string, the string to search for.

        ``by``
            string, the property to use for comparison (default: ``msgid``).

        ``include_obsolete_entries``
            boolean, whether to also search in entries that are obsolete.

        ``msgctxt``
            string, allows specifying a specific message context for the
            search.

        Returns the matching entry, or ``None`` when nothing matches.
        """
        if include_obsolete_entries:
            entries = self[:]
        else:
            entries = [e for e in self if not e.obsolete]
        matches = []
        for e in entries:
            if getattr(e, by) == st:
                # ``False`` means "no context filter requested"
                if msgctxt is not False and e.msgctxt != msgctxt:
                    continue
                matches.append(e)
        if len(matches) == 1:
            return matches[0]
        elif len(matches) > 1:
            if not msgctxt:
                # find the entry with no msgctx (the last one wins)
                e = None
                for m in matches:
                    if not m.msgctxt:
                        e = m
                if e:
                    return e
            # fallback to the first entry found
            return matches[0]
        return None

    def ordered_metadata(self):
        """
        Convenience method that returns an ordered version of the metadata
        dictionary. The return value is list of tuples (metadata name,
        metadata_value).
        """
        # copy the dict first
        metadata = self.metadata.copy()
        data_order = [
            'Project-Id-Version',
            'Report-Msgid-Bugs-To',
            'POT-Creation-Date',
            'PO-Revision-Date',
            'Last-Translator',
            'Language-Team',
            'Language',
            'MIME-Version',
            'Content-Type',
            'Content-Transfer-Encoding',
            'Plural-Forms',
        ]
        ordered_data = []
        for data in data_order:
            try:
                value = metadata.pop(data)
                ordered_data.append((data, value))
            except KeyError:
                pass
        # the rest of the metadata will be alphabetically ordered since there
        # are no specs for this AFAIK
        for data in natural_sort(metadata.keys()):
            value = metadata[data]
            ordered_data.append((data, value))
        return ordered_data

    def to_binary(self):
        """
        Return the binary representation of the file.
        """
        offsets = []
        entries = self.translated_entries()  # pylint: disable=no-member

        # the keys are sorted in the .mo file
        entries.sort(key=lambda o: o.msgid_with_context.encode('utf-8'))
        # add metadata entry
        mentry = self.metadata_as_entry()
        entries = [mentry] + entries
        entries_len = len(entries)
        ids, strs = b(''), b('')
        for e in entries:
            # For each string, we need size and file offset.  Each string is
            # NUL terminated; the NUL does not count into the size.
            msgid = b('')
            if e.msgctxt:
                # Contexts are stored by storing the concatenation of the
                # context, a <EOT> byte, and the original string
                msgid = self._encode(e.msgctxt + '\4')
            if e.msgid_plural:
                msgstr = []
                for index in sorted(e.msgstr_plural.keys()):
                    msgstr.append(e.msgstr_plural[index])
                msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
                msgstr = self._encode('\0'.join(msgstr))
            else:
                msgid += self._encode(e.msgid)
                msgstr = self._encode(e.msgstr)
            offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
            ids += msgid + b('\0')
            strs += msgstr + b('\0')

        # The header is 7 32-bit unsigned integers.
        keystart = 7 * 4 + 16 * entries_len
        # and the values start after the keys
        valuestart = keystart + len(ids)
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets += [l1, o1 + keystart]
            voffsets += [l2, o2 + valuestart]
        offsets = koffsets + voffsets

        output = struct.pack(
            "Iiiiiii",
            # Magic number
            MOFile.MAGIC,
            # Version
            0,
            # number of entries
            entries_len,
            # start of key index
            7 * 4,
            # start of value index
            7 * 4 + entries_len * 8,
            # size and offset of hash table, we don't use hash tables
            0,
            keystart,
        )
        if PY3 and sys.version_info.minor > 1:  # python 3.2 or superior
            output += array.array("i", offsets).tobytes()
        else:
            output += array.array("i", offsets).tostring()  # pylint: disable=no-member
        output += ids
        output += strs
        return output

    def _encode(self, mixed):
        """
        Encodes the given ``mixed`` argument with the file encoding if and
        only if it's an unicode string and returns the encoded string.
        """
        if isinstance(mixed, text_type):
            mixed = mixed.encode(self.encoding)
        return mixed
668 # }}}
669 # class POFile {{{
class POFile(_BaseFile):
    """
    Po (or Pot) file reader/writer.
    This class inherits the :class:`~polib._BaseFile` class and, by extension,
    the python ``list`` type.
    """

    def __unicode__(self):
        """
        Returns the unicode representation of the po file: the header
        rendered as ``#`` comment lines followed by the entries.
        """
        header_lines = []
        for header in self.header.split('\n'):
            if not header:
                header_lines.append("#\n")
            elif header[:1] in [',', ':']:
                header_lines.append('#%s\n' % header)
            else:
                header_lines.append('# %s\n' % header)
        ret = ''.join(header_lines)

        if not isinstance(ret, text_type):
            ret = ret.decode(self.encoding)

        return ret + _BaseFile.__unicode__(self)

    def save_as_mofile(self, fpath):
        """
        Saves the binary representation of the file to given ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the mo file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    def percent_translated(self):
        """
        Convenience method that returns the percentage of translated
        messages (``100`` when the file has no non-obsolete entry).
        """
        total = sum(1 for e in self if not e.obsolete)
        if total == 0:
            return 100
        translated = len(self.translated_entries())
        return int(translated * 100 / float(total))

    def translated_entries(self):
        """
        Convenience method that returns the list of translated entries.
        """
        return [e for e in self if e.translated()]

    def untranslated_entries(self):
        """
        Convenience method that returns the list of untranslated entries.
        """
        return [
            e for e in self if not e.translated() and not e.obsolete and not e.fuzzy
        ]

    def fuzzy_entries(self):
        """
        Convenience method that returns the list of fuzzy entries.
        """
        return [e for e in self if e.fuzzy and not e.obsolete]

    def obsolete_entries(self):
        """
        Convenience method that returns the list of obsolete entries.
        """
        return [e for e in self if e.obsolete]

    def merge(self, refpot):
        """
        Convenience method that merges the current pofile with the pot file
        provided. It behaves exactly as the gettext msgmerge utility:

        * comments of this file will be preserved, but extracted comments and
          occurrences will be discarded;
        * any translations or comments in the file will be discarded, however,
          dot comments and file positions will be preserved;
        * the fuzzy flags are preserved.

        Keyword argument:

        ``refpot``
            object POFile, the reference catalog.
        """
        # Index both catalogs by msgid-with-context for faster access
        own_entries = {entry.msgid_with_context: entry for entry in self}
        reference_ids = {entry.msgid_with_context for entry in refpot}
        # Merge entries that are in the refpot, creating the missing ones
        for reference in refpot:
            target = own_entries.get(reference.msgid_with_context)
            if target is None:
                target = POEntry()
                self.append(target)
            target.merge(reference)
        # ok, now we must "obsolete" entries that are not in the refpot anymore
        for entry in self:
            if entry.msgid_with_context not in reference_ids:
                entry.obsolete = True
777 # }}}
778 # class MOFile {{{
class MOFile(_BaseFile):
    """
    Mo file reader/writer.
    This class inherits the :class:`~polib._BaseFile` class and, by
    extension, the python ``list`` type.
    """

    MAGIC = 0x950412DE
    MAGIC_SWAPPED = 0xDE120495

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts all keywords arguments accepted by
        :class:`~polib._BaseFile` class.
        """
        _BaseFile.__init__(self, *args, **kwargs)
        self.magic_number = None
        self.version = 0

    def save_as_pofile(self, fpath):
        """
        Saves the mofile as a pofile to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath)

    # pylint: disable=no-self-use,arguments-differ
    def save(self, fpath=None):
        """
        Saves the mofile to ``fpath`` using its binary representation.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    # pylint: disable=no-self-use
    def percent_translated(self):
        """
        Always ``100``; kept for interface parity with POFile instances.
        """
        return 100

    # pylint: disable=no-self-use
    def translated_entries(self):
        """
        Returns the file itself; kept for interface parity with POFile
        instances.
        """
        return self

    # pylint: disable=no-self-use
    def untranslated_entries(self):
        """
        Always an empty list; kept for interface parity with POFile
        instances.
        """
        return []

    # pylint: disable=no-self-use
    def fuzzy_entries(self):
        """
        Always an empty list; kept for interface parity with POFile
        instances.
        """
        return []

    # pylint: disable=no-self-use
    def obsolete_entries(self):
        """
        Always an empty list; kept for interface parity with POFile
        instances.
        """
        return []
859 # }}}
860 # class _BaseEntry {{{
class _BaseEntry(object):
    """
    Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
    This class should **not** be instantiated directly.
    """

    def __init__(self, *_args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``msgid``
            string, the entry msgid.

        ``msgstr``
            string, the entry msgstr.

        ``msgid_plural``
            string, the entry msgid_plural.

        ``msgstr_plural``
            dict, the entry msgstr_plural lines.

        ``msgctxt``
            string, the entry context (msgctxt).

        ``obsolete``
            bool, whether the entry is "obsolete" or not.

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).
        """
        option = kwargs.get
        self.msgid = option('msgid', '')
        self.msgstr = option('msgstr', '')
        self.msgid_plural = option('msgid_plural', '')
        self.msgstr_plural = option('msgstr_plural', {})
        self.msgctxt = option('msgctxt', None)
        self.obsolete = option('obsolete', False)
        self.encoding = option('encoding', default_encoding)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry.
        """
        # obsolete entries have every line prefixed with "#~ "
        delflag = '#~ ' if self.obsolete else ''
        ret = []
        # write the msgctxt if any
        if self.msgctxt is not None:
            ret.extend(
                self._str_field("msgctxt", delflag, "", self.msgctxt, wrapwidth)
            )
        # write the msgid
        ret.extend(self._str_field("msgid", delflag, "", self.msgid, wrapwidth))
        # write the msgid_plural if any
        if self.msgid_plural:
            ret.extend(
                self._str_field(
                    "msgid_plural", delflag, "", self.msgid_plural, wrapwidth
                )
            )
        if self.msgstr_plural:
            # write the msgstr_plural lines ordered by plural index
            for index in sorted(self.msgstr_plural):
                ret.extend(
                    self._str_field(
                        "msgstr",
                        delflag,
                        '[%s]' % index,
                        self.msgstr_plural[index],
                        wrapwidth,
                    )
                )
        else:
            # otherwise write the msgstr
            ret.extend(self._str_field("msgstr", delflag, "", self.msgstr, wrapwidth))
        ret.append('')
        return u('\n').join(ret)

    if PY3:

        def __str__(self):
            return self.__unicode__()

    else:

        def __str__(self):
            """
            Returns the string representation of the entry.
            """
            return compat.ustr(self).encode(self.encoding)

    def __eq__(self, other):
        return str(self) == str(other)

    def __hash__(self):
        return hash(str(self))

    # pylint: disable=no-self-use
    def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78):
        """
        Returns the list of output lines for one field: the first line
        carries ``delflag``, the field name and ``plural_index``; every
        further line is a quoted continuation prefixed with ``delflag``.
        """
        lines = field.splitlines(True)
        if len(lines) > 1:
            # multi-line value: start with initial empty line
            lines = [''] + lines
        else:
            escaped_field = escape(field)
            specialchars_count = sum(
                field.count(c) for c in ['\\', '\n', '\r', '\t', '"']
            )
            # comparison must take into account fieldname length + one space
            # + 2 quotes (eg. msgid "<string>")
            flength = len(fieldname) + 3
            if plural_index:
                flength += len(plural_index)
            real_wrapwidth = wrapwidth - flength + specialchars_count
            if wrapwidth > 0 and len(field) > real_wrapwidth:
                # Wrap the line but take field name into account
                wrapped = textwrap.wrap(
                    escaped_field,
                    wrapwidth - 2,  # 2 for quotes ""
                    drop_whitespace=False,
                    break_long_words=False,
                )
                lines = [''] + [unescape(item) for item in wrapped]
            else:
                lines = [field]
        if fieldname.startswith('previous_'):
            # quick and dirty trick to get the real field name
            fieldname = fieldname[9:]

        first_line = lines.pop(0)
        ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index, escape(first_line))]
        for line in lines:
            ret.append('%s"%s"' % (delflag, escape(line)))
        return ret

    @property
    def msgid_with_context(self):
        """The msgid, prefixed with the msgctxt and an EOT byte when the
        entry has a context."""
        if self.msgctxt:
            return '%s%s%s' % (self.msgctxt, "\x04", self.msgid)
        return self.msgid
1004 # }}}
1005 # class POEntry {{{
1008 class POEntry(_BaseEntry):
1010 Represents a po file entry.
1013 def __init__(self, *args, **kwargs):
1015 Constructor, accepts the following keyword arguments:
1017 ``comment``
1018 string, the entry comment.
1020 ``tcomment``
1021 string, the entry translator comment.
1023 ``occurrences``
1024 list, the entry occurrences.
1026 ``flags``
1027 list, the entry flags.
1029 ``previous_msgctxt``
1030 string, the entry previous context.
1032 ``previous_msgid``
1033 string, the entry previous msgid.
1035 ``previous_msgid_plural``
1036 string, the entry previous msgid_plural.
1038 ``linenum``
1039 integer, the line number of the entry
1041 _BaseEntry.__init__(self, *args, **kwargs)
1042 self.comment = kwargs.get('comment', '')
1043 self.tcomment = kwargs.get('tcomment', '')
1044 self.occurrences = kwargs.get('occurrences', [])
1045 self.flags = kwargs.get('flags', [])
1046 self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
1047 self.previous_msgid = kwargs.get('previous_msgid', None)
1048 self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
1049 self.linenum = kwargs.get('linenum', None)
1051 def __unicode__(self, wrapwidth=78):
1053 Returns the unicode representation of the entry.
1055 ret = []
1056 # comments first, if any (with text wrapping as xgettext does)
1057 if self.obsolete:
1058 comments = [('tcomment', '# ')]
1059 else:
1060 comments = [('comment', '#. '), ('tcomment', '# ')]
1061 for c in comments:
1062 val = getattr(self, c[0])
1063 if val:
1064 for comment in val.split('\n'):
1065 if len(comment) + len(c[1]) > wrapwidth > 0:
1066 ret += textwrap.wrap(
1067 comment,
1068 wrapwidth,
1069 initial_indent=c[1],
1070 subsequent_indent=c[1],
1071 break_long_words=False,
1073 else:
1074 ret.append('%s%s' % (c[1], comment))
1076 # occurrences (with text wrapping as xgettext does)
1077 if not self.obsolete and self.occurrences:
1078 filelist = []
1079 for fpath, lineno in self.occurrences:
1080 if lineno:
1081 filelist.append('%s:%s' % (fpath, lineno))
1082 else:
1083 filelist.append(fpath)
1084 filestr = ' '.join(filelist)
1085 if len(filestr) + 3 > wrapwidth > 0:
1086 # textwrap split words that contain hyphen, this is not
1087 # what we want for filenames, so the dirty hack is to
1088 # temporally replace hyphens with a char that a file cannot
1089 # contain, like "*"
1090 ret += [
1091 line.replace('*', '-')
1092 for line in textwrap.wrap(
1093 filestr.replace('-', '*'),
1094 wrapwidth,
1095 initial_indent='#: ',
1096 subsequent_indent='#: ',
1097 break_long_words=False,
1100 else:
1101 ret.append('#: ' + filestr)
1103 # flags (TODO: wrapping ?)
1104 if self.flags:
1105 ret.append('#, %s' % ', '.join(self.flags))
1107 # previous context and previous msgid/msgid_plural
1108 fields = ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural']
1109 if self.obsolete:
1110 prefix = "#~| "
1111 else:
1112 prefix = "#| "
1113 for f in fields:
1114 val = getattr(self, f)
1115 if val is not None:
1116 ret += self._str_field(f, prefix, "", val, wrapwidth)
1118 ret.append(_BaseEntry.__unicode__(self, wrapwidth))
1119 ret = u('\n').join(ret)
1120 return ret
1122 # pylint: disable=cmp-method,too-many-return-statements
1123 def __cmp__(self, other):
1125 Called by comparison operations if rich comparison is not defined.
1127 # First: Obsolete test
1128 if self.obsolete != other.obsolete:
1129 if self.obsolete:
1130 return -1
1131 else:
1132 return 1
1133 # Work on a copy to protect original
1134 occ1 = sorted(self.occurrences[:])
1135 occ2 = sorted(other.occurrences[:])
1136 if occ1 > occ2:
1137 return 1
1138 if occ1 < occ2:
1139 return -1
1140 # Compare context
1141 msgctxt = self.msgctxt or '0'
1142 othermsgctxt = other.msgctxt or '0'
1143 if msgctxt > othermsgctxt:
1144 return 1
1145 elif msgctxt < othermsgctxt:
1146 return -1
1147 # Compare msgid_plural
1148 msgid_plural = self.msgid_plural or '0'
1149 othermsgid_plural = other.msgid_plural or '0'
1150 if msgid_plural > othermsgid_plural:
1151 return 1
1152 elif msgid_plural < othermsgid_plural:
1153 return -1
1154 # Compare msgstr_plural
1155 if self.msgstr_plural and isinstance(self.msgstr_plural, dict):
1156 msgstr_plural = list(self.msgstr_plural.values())
1157 else:
1158 msgstr_plural = []
1159 if other.msgstr_plural and isinstance(other.msgstr_plural, dict):
1160 othermsgstr_plural = list(other.msgstr_plural.values())
1161 else:
1162 othermsgstr_plural = []
1163 if msgstr_plural > othermsgstr_plural:
1164 return 1
1165 elif msgstr_plural < othermsgstr_plural:
1166 return -1
1167 # Compare msgid
1168 if self.msgid > other.msgid:
1169 return 1
1170 elif self.msgid < other.msgid:
1171 return -1
1172 # Compare msgstr
1173 if self.msgstr > other.msgstr:
1174 return 1
1175 elif self.msgstr < other.msgstr:
1176 return -1
1177 return 0
1179 def __gt__(self, other):
1180 return self.__cmp__(other) > 0
1182 def __lt__(self, other):
1183 return self.__cmp__(other) < 0
1185 def __ge__(self, other):
1186 return self.__cmp__(other) >= 0
1188 def __le__(self, other):
1189 return self.__cmp__(other) <= 0
1191 def __eq__(self, other):
1192 return self.__cmp__(other) == 0
1194 def __ne__(self, other):
1195 return self.__cmp__(other) != 0
1197 def translated(self):
1199 Returns ``True`` if the entry has been translated or ``False``
1200 otherwise.
1202 if self.obsolete or self.fuzzy:
1203 return False
1204 if self.msgstr != '':
1205 return True
1206 if self.msgstr_plural:
1207 for pos in self.msgstr_plural:
1208 if self.msgstr_plural[pos] == '':
1209 return False
1210 return True
1211 return False
def merge(self, other):
    """
    Merge the current entry with the given pot entry.

    The identifying and descriptive fields (``msgid``, ``msgctxt``,
    ``occurrences``, ``comment``, ``msgid_plural``, ``obsolete`` and the
    ``previous_*`` fields) are taken from ``other``.  The flags are
    replaced by a copy of ``other.flags``; if this entry was fuzzy
    before the merge, the ``'fuzzy'`` flag is kept (without duplicating
    it when ``other`` already carries it).  Plural positions present in
    ``other`` but missing here are created with an empty translation;
    existing translations are left untouched.
    """
    self.msgid = other.msgid
    self.msgctxt = other.msgctxt
    self.occurrences = other.occurrences
    self.comment = other.comment
    # read the fuzzy state before the flags are overwritten below
    was_fuzzy = self.fuzzy
    self.flags = other.flags[:]  # clone flags
    if was_fuzzy and 'fuzzy' not in self.flags:
        self.flags.append('fuzzy')
    self.msgid_plural = other.msgid_plural
    self.obsolete = other.obsolete
    self.previous_msgctxt = other.previous_msgctxt
    self.previous_msgid = other.previous_msgid
    self.previous_msgid_plural = other.previous_msgid_plural
    if other.msgstr_plural:
        for pos in other.msgstr_plural:
            # keep existing translation at pos if any
            self.msgstr_plural.setdefault(pos, '')
@property
def fuzzy(self):
    """``True`` when ``'fuzzy'`` is among the entry's flags."""
    flags = self.flags
    return 'fuzzy' in flags
1242 def __hash__(self):
1243 return hash((self.msgid, self.msgstr))
1246 # }}}
1247 # class MOEntry {{{
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments,
        for consistency with :class:`~polib.POEntry`:

            ``comment``
            ``tcomment``
            ``occurrences``
            ``flags``
            ``previous_msgctxt``
            ``previous_msgid``
            ``previous_msgid_plural``

        Note: even though these keyword arguments are accepted,
        they hold no real meaning in the context of MO files
        and are simply ignored: after the base constructor has run,
        each of them is reset to an empty/``None`` value.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # text fields
        self.comment = self.tcomment = ''
        # list fields (each gets its own fresh list)
        self.occurrences = []
        self.flags = []
        # "previous" fields
        self.previous_msgctxt = None
        self.previous_msgid = None
        self.previous_msgid_plural = None

    def __hash__(self):
        """Hash the entry on its ``(msgid, msgstr)`` pair."""
        identity = (self.msgid, self.msgstr)
        return hash(identity)
1285 # }}}
1286 # class _POFileParser {{{
1289 class _POFileParser(object):
1291 A finite state machine to parse efficiently and correctly po
1292 file format.
1295 # pylint: disable=redefined-outer-name
def __init__(self, pofile, *_args, **kwargs):
    """
    Constructor.

    Keyword arguments:

    ``pofile``
        string, path to the po file or its content

    ``encoding``
        string, the encoding to use, defaults to ``default_encoding``
        global variable (optional).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class used to build the file instance; ``POFile`` is used when
        it is missing or ``None`` (optional).
    """
    enc = kwargs.get('encoding', default_encoding)
    if not _is_file(pofile):
        # not a path: treat the argument as the po content itself
        self.fhandle = pofile.splitlines()
    else:
        try:
            self.fhandle = io.open(pofile, 'rt', encoding=enc)
        except LookupError:
            # unknown codec name: fall back to the default encoding
            enc = default_encoding
            self.fhandle = io.open(pofile, 'rt', encoding=enc)

    instance_class = kwargs.get('klass')
    if instance_class is None:
        instance_class = POFile
    self.instance = instance_class(
        pofile=pofile,
        encoding=enc,
        check_for_duplicates=kwargs.get('check_for_duplicates', False),
    )
    self.transitions = {}
    self.current_line = 0
    self.current_entry = POEntry(linenum=self.current_line)
    self.current_state = 'st'
    self.current_token = None
    # two memo flags used in handlers
    self.msgstr_index = 0
    self.entry_obsolete = 0
    # Configure the state machine, by adding transitions.
    # Signification of symbols:
    #     * ST: Beginning of the file (start)
    #     * HE: Header
    #     * TC: a translation comment
    #     * GC: a generated comment
    #     * OC: a file/line occurrence
    #     * FL: a flags line
    #     * CT: a message context
    #     * PC: a previous msgctxt
    #     * PM: a previous msgid
    #     * PP: a previous msgid_plural
    #     * MI: a msgid
    #     * MP: a msgid plural
    #     * MS: a msgstr
    #     * MX: a msgstr plural
    #     * MC: a msgid or msgstr continuation line
    every_state = [
        'st',
        'he',
        'gc',
        'oc',
        'fl',
        'ct',
        'pc',
        'pm',
        'pp',
        'tc',
        'ms',
        'mp',
        'mx',
        'mi',
    ]

    # a comment seen at the very top of the file belongs to the header
    self.add('tc', ['st', 'he'], 'he')
    self.add(
        'tc',
        ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mp', 'mx', 'mi'],
        'tc',
    )
    # these symbols are accepted from any state
    for symbol in ('gc', 'oc', 'fl', 'pc', 'pm', 'pp'):
        self.add(symbol, every_state, symbol)
    self.add(
        'ct',
        ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
        'ct',
    )
    self.add(
        'mi',
        ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
        'mi',
    )
    self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
    self.add('ms', ['mi', 'mp', 'tc'], 'ms')
    self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
    self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')
1401 # pylint: disable=too-many-branches
def parse(self):
    """
    Run the state machine, parse the file line by line and call process()
    with the current matched symbol.

    Returns the populated file instance.  Raises ``IOError`` when a line
    cannot be recognised or violates the po syntax.  When the input was
    opened from a path, the file handle is closed whether parsing
    succeeds or fails.
    """

    keywords = {
        'msgctxt': 'ct',
        'msgid': 'mi',
        'msgstr': 'ms',
        'msgid_plural': 'mp',
    }
    prev_keywords = {
        'msgid_plural': 'pp',
        'msgid': 'pm',
        'msgctxt': 'pc',
    }
    # compiled once instead of being looked up for every line
    unescaped_quote = re.compile(r'([^\\]|^)"')
    tokens = []
    fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''

    def syntax_error(detail=''):
        """Build the IOError reported for the line being parsed."""
        message = 'Syntax error in po file %s(line %s)' % (
            fpath,
            self.current_line,
        )
        if detail:
            message += ': ' + detail
        return IOError(message)

    try:
        for line in self.fhandle:
            self.current_line += 1
            if self.current_line == 1:
                # drop a leading UTF-8 byte order mark, if any
                BOM = codecs.BOM_UTF8.decode('utf-8')
                if line.startswith(BOM):
                    line = line[len(BOM) :]
            line = line.strip()
            if line == '':
                continue

            tokens = line.split(None, 2)
            nb_tokens = len(tokens)

            if tokens[0] == '#~|':
                continue

            if tokens[0] == '#~' and nb_tokens > 1:
                # obsolete entry: strip the marker and parse the remainder
                line = line[3:].strip()
                tokens = tokens[1:]
                nb_tokens -= 1
                self.entry_obsolete = 1
            else:
                self.entry_obsolete = 0

            # Take care of keywords like
            # msgid, msgid_plural, msgctxt & msgstr.
            if tokens[0] in keywords and nb_tokens > 1:
                line = line[len(tokens[0]) :].lstrip()
                if unescaped_quote.search(line[1:-1]):
                    raise syntax_error('unescaped double quote found')
                self.current_token = line
                self.process(keywords[tokens[0]])
                continue

            self.current_token = line

            if tokens[0] == '#:':
                if nb_tokens <= 1:
                    continue
                # we are on a occurrences line
                self.process('oc')

            elif line[:1] == '"':
                # we are on a continuation line
                if unescaped_quote.search(line[1:-1]):
                    raise syntax_error('unescaped double quote found')
                self.process('mc')

            elif line[:7] == 'msgstr[':
                # we are on a msgstr plural
                self.process('mx')

            elif tokens[0] == '#,':
                if nb_tokens <= 1:
                    continue
                # we are on a flags line
                self.process('fl')

            elif tokens[0] == '#' or tokens[0].startswith('##'):
                # we are on a translator comment line
                self.process('tc')

            elif tokens[0] == '#.':
                if nb_tokens <= 1:
                    continue
                # we are on a generated comment line
                self.process('gc')

            elif tokens[0] == '#|':
                if nb_tokens <= 1:
                    raise syntax_error()

                # Remove the marker and any whitespace right after that.
                line = line[2:].lstrip()
                self.current_token = line

                if tokens[1].startswith('"'):
                    # Continuation of previous metadata.
                    self.process('mc')
                    continue

                if nb_tokens == 2:
                    # Invalid continuation line.
                    raise syntax_error('invalid continuation line')

                # we are on a "previous translation" comment line,
                if tokens[1] not in prev_keywords:
                    # Unknown keyword in previous translation comment.
                    raise syntax_error('unknown keyword %s' % tokens[1])

                # Remove the keyword and any whitespace
                # between it and the starting quote.
                line = line[len(tokens[1]) :].lstrip()
                self.current_token = line
                self.process(prev_keywords[tokens[1]])

            else:
                raise syntax_error()

        if self.current_entry and len(tokens) > 0 and not tokens[0].startswith('#'):
            # since entries are added when another entry is found, we must add
            # the last entry here (only if there are lines). Trailing comments
            # are ignored
            self.instance.append(self.current_entry)

        # before returning the instance, check if there's metadata and if
        # so extract it in a dict
        metadataentry = self.instance.find('')
        if metadataentry:  # metadata found
            # remove the entry
            self.instance.remove(metadataentry)
            self.instance.metadata_is_fuzzy = metadataentry.flags
            key = None
            for msg in metadataentry.msgstr.splitlines():
                try:
                    key, val = msg.split(':', 1)
                    self.instance.metadata[key] = val.strip()
                except (ValueError, KeyError):
                    # no "key: value" pair on this line: treat it as the
                    # continuation of the previous key, if there is one
                    if key is not None:
                        self.instance.metadata[key] += '\n' + msg.strip()
    finally:
        # close opened file, also when a syntax error aborts the parsing
        if not isinstance(self.fhandle, list):  # must be file
            self.fhandle.close()
    return self.instance
def add(self, symbol, states, next_state):
    """
    Register transitions in the state machine.

    Keyword arguments:

    ``symbol``
        string, the matched token (two chars symbol).

    ``states``
        list, the states (two chars symbols) from which ``symbol`` is
        accepted.

    ``next_state``
        the state the fsm will have after the action; the action itself
        is the ``handle_<next_state>`` method of the parser.
    """
    self.transitions.update(
        ((symbol, state), (getattr(self, 'handle_%s' % next_state), next_state))
        for state in states
    )
def process(self, symbol):
    """
    Run the transition registered for ``symbol`` in the current state.

    Keyword arguments:

    ``symbol``
        string, the matched token (two chars symbol).

    The state only changes when the transition's action returns a true
    value.  Any exception (no such transition, or a failure inside the
    action) closes the file handle if it has a ``close`` method and is
    reported as an ``IOError`` syntax error carrying the current line
    number.
    """
    try:
        handler, target_state = self.transitions[(symbol, self.current_state)]
        moved = handler()
        if moved:
            self.current_state = target_state
    except Exception:
        location = ''
        if self.instance.fpath:
            location = '%s ' % self.instance.fpath
        if hasattr(self.fhandle, 'close'):
            self.fhandle.close()
        message = 'Syntax error in po file %s(line %s)' % (
            location,
            self.current_line,
        )
        raise IOError(message)
1609 # state handlers
def handle_he(self):
    """Handle a header comment: append it to the file header."""
    header = self.instance.header
    if header != '':
        header += '\n'
    # the first two characters of the token (comment marker) are dropped
    self.instance.header = header + self.current_token[2:]
    return 1
def handle_tc(self):
    """
    Handle a translator comment.

    Coming from a msgstr state, the entry built so far is appended to
    the file and a new entry is started.  The comment text (leading
    ``#`` characters and one following space removed) is added to the
    current entry's ``tcomment``, on a new line if it is not empty.
    """
    if self.current_state in ('mc', 'ms', 'mx'):
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    entry = self.current_entry
    if entry.tcomment != '':
        entry.tcomment += '\n'
    text = self.current_token.lstrip('#')
    if text.startswith(' '):
        text = text[1:]
    entry.tcomment += text
    return True
def handle_gc(self):
    """
    Handle a generated comment.

    Coming from a msgstr state, the entry built so far is appended to
    the file and a new entry is started.  The token minus its first
    three characters is added to the current entry's ``comment``, on a
    new line if it is not empty.
    """
    if self.current_state in ('mc', 'ms', 'mx'):
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    entry = self.current_entry
    if entry.comment != '':
        entry.comment += '\n'
    entry.comment += self.current_token[3:]
    return True
def handle_oc(self):
    """
    Handle a file:num occurrence.

    Coming from a msgstr state, the entry built so far is appended to
    the file and a new entry is started.  Every whitespace separated
    reference on the line is stored in the current entry's
    ``occurrences`` as a ``(file, line)`` tuple; when the part after the
    last ``:`` is not a number (or there is no ``:``), the whole
    reference is kept as the file and the line is ``''``.
    """
    if self.current_state in ('mc', 'ms', 'mx'):
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    for reference in self.current_token[3:].split():
        fil, colon, line = reference.rpartition(':')
        if not colon or not line.isdigit():
            fil, line = reference, ''
        self.current_entry.occurrences.append((fil, line))
    return True
def handle_fl(self):
    """
    Handle a flags line.

    Coming from a msgstr state, the entry built so far is appended to
    the file and a new entry is started.  The comma separated flags
    found after the first three characters of the token are stripped and
    added to the current entry's ``flags``.
    """
    if self.current_state in ('mc', 'ms', 'mx'):
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    raw_flags = self.current_token[3:].split(',')
    self.current_entry.flags += [flag.strip() for flag in raw_flags]
    return True
def handle_pp(self):
    """
    Handle a previous msgid_plural line.

    Coming from a msgstr state, the entry built so far is appended to
    the file and a new entry is started.  The token, without its first
    and last character (the quotes), is unescaped and stored as
    ``previous_msgid_plural``.
    """
    if self.current_state in ('mc', 'ms', 'mx'):
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    quoted = self.current_token
    self.current_entry.previous_msgid_plural = unescape(quoted[1:-1])
    return True
def handle_pm(self):
    """
    Handle a previous msgid line.

    Coming from a msgstr state, the entry built so far is appended to
    the file and a new entry is started.  The token, without its first
    and last character (the quotes), is unescaped and stored as
    ``previous_msgid``.
    """
    if self.current_state in ('mc', 'ms', 'mx'):
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    quoted = self.current_token
    self.current_entry.previous_msgid = unescape(quoted[1:-1])
    return True
def handle_pc(self):
    """
    Handle a previous msgctxt line.

    Coming from a msgstr state, the entry built so far is appended to
    the file and a new entry is started.  The token, without its first
    and last character (the quotes), is unescaped and stored as
    ``previous_msgctxt``.
    """
    if self.current_state in ('mc', 'ms', 'mx'):
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    quoted = self.current_token
    self.current_entry.previous_msgctxt = unescape(quoted[1:-1])
    return True
def handle_ct(self):
    """
    Handle a msgctxt.

    Coming from a msgstr state, the entry built so far is appended to
    the file and a new entry is started.  The token, without its first
    and last character (the quotes), is unescaped and stored as
    ``msgctxt``.
    """
    if self.current_state in ('mc', 'ms', 'mx'):
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    quoted = self.current_token
    self.current_entry.msgctxt = unescape(quoted[1:-1])
    return True
def handle_mi(self):
    """
    Handle a msgid.

    Coming from a msgstr state, the entry built so far is appended to
    the file and a new entry is started.  The current entry receives the
    obsolete marker recorded for the line and the unescaped token
    (without its first and last character, the quotes) as ``msgid``.
    """
    if self.current_state in ('mc', 'ms', 'mx'):
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    entry = self.current_entry
    entry.obsolete = self.entry_obsolete
    entry.msgid = unescape(self.current_token[1:-1])
    return True
def handle_mp(self):
    """Handle a msgid plural: store the unescaped, unquoted token."""
    quoted = self.current_token
    self.current_entry.msgid_plural = unescape(quoted[1:-1])
    return True
def handle_ms(self):
    """Handle a msgstr: store the unescaped, unquoted token."""
    quoted = self.current_token
    self.current_entry.msgstr = unescape(quoted[1:-1])
    return True
def handle_mx(self):
    """
    Handle a msgstr plural.

    The token has the form ``msgstr[N] "..."``.  The plural index ``N``
    is read up to the closing ``]`` so that indexes of more than one
    digit are stored under the right key.  The text between the first
    double quote and the last character of the token is unescaped and
    stored in ``msgstr_plural[N]``; ``N`` is also remembered in
    ``msgstr_index`` for continuation lines.

    A missing ``]`` or a non-numeric index raises ``ValueError``.
    """
    token = self.current_token
    # 7 == len('msgstr['): the index starts right after the bracket
    index = int(token[7 : token.index(']', 7)])
    value = token[token.find('"') + 1 : -1]
    self.current_entry.msgstr_plural[index] = unescape(value)
    self.msgstr_index = index
    return True
def handle_mc(self):
    """
    Handle a msgid or msgstr continuation line.

    The unescaped, unquoted token is appended to the field of the
    current entry that matches the current state; in the ``mx`` state it
    is appended to the plural form at ``msgstr_index``.  Returns
    ``False`` so that the state machine stays in its current state.
    """
    token = unescape(self.current_token[1:-1])
    # state -> name of the entry attribute being continued
    continued_field = {
        'ct': 'msgctxt',
        'mi': 'msgid',
        'mp': 'msgid_plural',
        'ms': 'msgstr',
        'pp': 'previous_msgid_plural',
        'pm': 'previous_msgid',
        'pc': 'previous_msgctxt',
    }
    state = self.current_state
    entry = self.current_entry
    if state == 'mx':
        entry.msgstr_plural[self.msgstr_index] += token
    elif state in continued_field:
        name = continued_field[state]
        setattr(entry, name, getattr(entry, name) + token)
    # don't change the current state
    return False
1751 # }}}
1752 # class _MOFileParser {{{
1755 class _MOFileParser(object):
1757 A class to parse binary mo files.
1760 # pylint: disable=unused-argument,redefined-outer-name
def __init__(self, mofile, *_args, **kwargs):
    """
    Constructor.

    Keyword arguments:

    ``mofile``
        string, path to the mo file or its content

    ``encoding``
        string, the encoding to use, defaults to ``default_encoding``
        global variable (optional).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class used to build the file instance; ``MOFile`` is used when
        it is missing or ``None`` (optional).
    """
    # a path is opened in binary mode, anything else is wrapped in a
    # bytes buffer and read as the mo content itself
    self.fhandle = open(mofile, 'rb') if _is_file(mofile) else io.BytesIO(mofile)

    instance_class = kwargs.get('klass')
    if instance_class is None:
        instance_class = MOFile
    self.instance = instance_class(
        fpath=mofile,
        encoding=kwargs.get('encoding', default_encoding),
        check_for_duplicates=kwargs.get('check_for_duplicates', False),
    )
1792 def __del__(self):
1794 Make sure the file is closed, this prevents warnings on unclosed file
1795 when running tests with python >= 3.2.
1797 if self.fhandle and hasattr(self.fhandle, 'close'):
1798 self.fhandle.close()
def parse(self):
    """
    Build the instance with the file handle provided in the
    constructor.

    Reads the magic number, the version, the number of strings and the
    two index tables, then builds one entry per string pair.  A first
    pair with an empty msgid is parsed as the metadata instead.  Raises
    ``IOError`` for an unknown magic number or an unexpected major
    revision.  The file handle is closed before the instance is
    returned.
    """
    # parse magic number; it tells the byte order of all later fields
    magic_number = self._readbinary('<I', 4)
    if magic_number == MOFile.MAGIC:
        pair_fmt = '<II'
    elif magic_number == MOFile.MAGIC_SWAPPED:
        pair_fmt = '>II'
    else:
        raise IOError('Invalid mo file, magic number is incorrect !')
    self.instance.magic_number = magic_number
    # parse the version number and the number of strings
    version, numofstrings = self._readbinary(pair_fmt, 8)
    # from MO file format specs: "A program seeing an unexpected major
    # revision number should stop reading the MO file entirely"
    if version >> 16 not in (0, 1):
        raise IOError('Invalid mo file, unexpected major revision number')
    self.instance.version = version
    # original strings and translation strings hash table offset
    msgids_hash_offset, msgstrs_hash_offset = self._readbinary(pair_fmt, 8)
    # move to msgid hash table and read length and offset of msgids
    self.fhandle.seek(msgids_hash_offset)
    msgids_index = [self._readbinary(pair_fmt, 8) for _ in range(numofstrings)]
    # move to msgstr hash table and read length and offset of msgstrs
    self.fhandle.seek(msgstrs_hash_offset)
    msgstrs_index = [self._readbinary(pair_fmt, 8) for _ in range(numofstrings)]
    # build entries
    encoding = self.instance.encoding
    for position in range(numofstrings):
        id_length, id_offset = msgids_index[position][0], msgids_index[position][1]
        self.fhandle.seek(id_offset)
        msgid = self.fhandle.read(id_length)

        str_length, str_offset = (
            msgstrs_index[position][0],
            msgstrs_index[position][1],
        )
        self.fhandle.seek(str_offset)
        msgstr = self.fhandle.read(str_length)
        if position == 0 and not msgid:  # metadata
            metadata = {}
            for line in msgstr.split(b('\n')):
                tokens = line.split(b(':'), 1)
                if tokens[0] == b(''):
                    continue
                try:
                    k = tokens[0].decode(encoding)
                    v = tokens[1].decode(encoding)
                    metadata[k] = v.strip()
                except IndexError:
                    # line without a ':' separator: key with empty value
                    metadata[k] = u('')
            self.instance.metadata = metadata
            continue
        # test if we have a plural entry
        msgid_tokens = msgid.split(b('\0'))
        if len(msgid_tokens) > 1:
            entry = self._build_entry(
                msgid=msgid_tokens[0],
                msgid_plural=msgid_tokens[1],
                msgstr_plural=dict(enumerate(msgstr.split(b('\0')))),
            )
        else:
            entry = self._build_entry(msgid=msgid, msgstr=msgstr)
        self.instance.append(entry)
    # close opened file
    self.fhandle.close()
    return self.instance
def _build_entry(self, msgid, msgstr=None, msgid_plural=None, msgstr_plural=None):
    """
    Build a ``MOEntry`` from raw byte strings.

    ``msgid`` is split on the ``\\x04`` byte; when it has more than one
    part, the first is decoded as ``msgctxt`` and the second as
    ``msgid``.  The other arguments are decoded with the instance
    encoding and passed on only when they are not empty.  The values of
    ``msgstr_plural`` are decoded in place.
    """
    parts = msgid.split(b('\x04'))
    encoding = self.instance.encoding
    kwargs = {}
    if len(parts) > 1:
        kwargs['msgctxt'] = parts[0].decode(encoding)
        kwargs['msgid'] = parts[1].decode(encoding)
    else:
        kwargs['msgid'] = msgid.decode(encoding)
    if msgstr:
        kwargs['msgstr'] = msgstr.decode(encoding)
    if msgid_plural:
        kwargs['msgid_plural'] = msgid_plural.decode(encoding)
    if msgstr_plural:
        for index in msgstr_plural:
            raw = msgstr_plural[index]
            msgstr_plural[index] = raw.decode(encoding)
        kwargs['msgstr_plural'] = msgstr_plural
    return MOEntry(**kwargs)
1891 def _readbinary(self, fmt, numbytes):
1893 Private method that unpack n bytes of data using format <fmt>.
1894 It returns a tuple or a mixed value if the tuple length is 1.
1896 content = self.fhandle.read(numbytes)
1897 tup = struct.unpack(fmt, content)
1898 if len(tup) == 1:
1899 return tup[0]
1900 return tup
1903 # }}}