docs: remove references to jaraco.packaging.sphinx
[git-cola.git] / cola / polib.py
blob1f59dd2ddce2c0ef3dc547a141a74990dd88ed88
2 # License: MIT (see extras/polib/LICENSE file provided)
3 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
4 # pylint: disable=consider-using-with,no-else-return
6 """
7 **polib** allows you to manipulate, create, modify gettext files (pot, po and
8 mo files). You can load existing files, iterate through its entries, add,
9 modify entries, comments or metadata, etc. or create new po files from scratch.
11 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
12 :func:`~polib.mofile` convenience functions.
13 """
14 import array
15 import codecs
16 import os
17 import re
18 import struct
19 import sys
20 import textwrap
21 import io
23 from . import compat
__author__ = 'David Jean Louis <izimobil@gmail.com>'
__version__ = '1.1.1'

# names exported by ``from polib import *``
__all__ = [
    'pofile',
    'POFile',
    'POEntry',
    'mofile',
    'MOFile',
    'MOEntry',
    'default_encoding',
    'escape',
    'unescape',
    'detect_encoding',
]

# fallback encoding, used whenever the encoding of a file cannot be detected
default_encoding = 'utf-8'

# python 2/3 compatibility helpers {{{

# hard-wired: this copy of the module only targets Python 3
PY3 = True
# the type used for (unicode) text
text_type = str
def b(s):
    """
    Returns the text ``s`` encoded as UTF-8 bytes.
    """
    encoded = s.encode('utf-8')
    return encoded
def u(s):
    """
    Returns ``s`` unchanged (one of the python 2/3 compatibility helpers).
    """
    return s
60 # }}}
61 # _pofile_or_mofile {{{
def _pofile_or_mofile(f, filetype, **kwargs):
    """
    Shared implementation of :func:`polib.pofile` and :func:`polib.mofile`.

    Arguments:

    ``f``
        string, path to the file or its content.

    ``filetype``
        string, ``'pofile'`` selects the po parser, any other value selects
        the mo parser.

    The ``encoding``, ``check_for_duplicates``, ``klass`` and ``wrapwidth``
    keyword arguments are honored; the parsed instance is returned.
    """
    # use the requested encoding, otherwise try to detect it
    encoding = kwargs.get('encoding')
    if encoding is None:
        encoding = detect_encoding(f, filetype == 'mofile')

    # pick the parser matching the file type and parse the file
    if filetype == 'pofile':
        parser_class = _POFileParser
    else:
        parser_class = _MOFileParser
    parsed = parser_class(
        f,
        encoding=encoding,
        check_for_duplicates=kwargs.get('check_for_duplicates', False),
        klass=kwargs.get('klass'),
    ).parse()
    parsed.wrapwidth = kwargs.get('wrapwidth', 78)
    return parsed
87 # }}}
88 # _is_file {{{
def _is_file(filename_or_contents):
    """
    Safely returns the value of os.path.isfile(filename_or_contents).

    Arguments:

    ``filename_or_contents``
        either a filename, or a string holding the contents of some file.
        A value that makes ``os.path.isfile`` raise ``TypeError``,
        ``ValueError`` or ``UnicodeEncodeError`` yields ``False``.
    """
    try:
        found = os.path.isfile(filename_or_contents)
    except (TypeError, ValueError, UnicodeEncodeError):
        found = False
    return found
107 # }}}
108 # function pofile() {{{
111 # pylint: disable=redefined-outer-name
def pofile(pofile, **kwargs):
    """
    Parses the po or pot file ``pofile`` and hands back the resulting
    :class:`~polib.POFile` instance.

    Arguments:

    ``pofile``
        string, full or relative path to the po/pot file or its content (data).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.POFile`
        instance).
    """
    return _pofile_or_mofile(pofile, filetype='pofile', **kwargs)
142 # }}}
143 # function mofile() {{{
146 # pylint: disable=redefined-outer-name
def mofile(mofile, **kwargs):
    """
    Convenience function that parses the mo file ``mofile`` and returns a
    :class:`~polib.MOFile` instance.

    Arguments:

    ``mofile``
        string, full or relative path to the mo file or its content (string
        or bytes).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext to generate the po file that was used to format the mo file
        (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.MOFile`
        instance).
    """
    return _pofile_or_mofile(mofile, 'mofile', **kwargs)
179 # }}}
180 # function detect_encoding() {{{
def detect_encoding(file, binary_mode=False):
    """
    Try to detect the encoding used by the ``file``. The ``file`` argument can
    be a PO or MO file path or a string containing the contents of the file.
    If the encoding cannot be detected, the function will return the value of
    ``default_encoding``.

    Arguments:

    ``file``
        string, full or relative path to the po/mo file or its content.

    ``binary_mode``
        boolean, set this to True if ``file`` is a mo file.
    """
    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
    rxt = re.compile(u(PATTERN))
    rxb = re.compile(b(PATTERN))

    def charset_exists(charset):
        """Check whether ``charset`` is valid or not."""
        try:
            codecs.lookup(charset)
        except LookupError:
            return False
        return True

    def charset_from(match):
        """Return the charset captured by ``match`` if it is valid, else None."""
        enc = match.group(1).strip()
        if not isinstance(enc, text_type):
            enc = enc.decode('utf-8')
        return enc if charset_exists(enc) else None

    if not _is_file(file):
        # ``file`` holds the contents: text first, bytes as a fallback
        try:
            match = rxt.search(file)
        except TypeError:
            match = rxb.search(file)
        if match:
            enc = charset_from(match)
            if enc is not None:
                return enc
    else:
        # For PY3, always treat as binary
        if binary_mode or PY3:
            mode = 'rb'
            rx = rxb
        else:
            mode = 'r'
            rx = rxt
        # the context manager closes the handle on every exit path,
        # including the early return and any read error
        with open(file, mode) as f:
            for line in f:
                match = rx.search(line)
                if match:
                    enc = charset_from(match)
                    if enc is not None:
                        return enc
    return default_encoding
243 # }}}
244 # function escape() {{{
def escape(st):
    """
    Returns ``st`` with the backslash, tab, carriage return, newline and
    double quote characters replaced by their backslash escape sequences.
    """
    # the backslash must be handled first so that the backslashes added by
    # the following replacements are not escaped a second time
    replacements = (
        ('\\', r'\\'),
        ('\t', r'\t'),
        ('\r', r'\r'),
        ('\n', r'\n'),
        ('"', r'\"'),
    )
    escaped = st
    for raw, sequence in replacements:
        escaped = escaped.replace(raw, sequence)
    return escaped
261 # }}}
262 # function unescape() {{{
def unescape(st):
    """
    Returns ``st`` with the escape sequences for backslash, tab, newline,
    carriage return and double quote turned back into the characters they
    stand for.
    """
    # escape letter -> real character; the double quote is not listed
    # because it simply maps to itself
    decoded = {'n': '\n', 't': '\t', 'r': '\r', '\\': '\\'}

    def unescape_repl(match):
        char = match.group(1)
        return decoded.get(char, char)

    return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
286 # }}}
287 # function natural_sort() {{{
def natural_sort(lst):
    """
    Returns a new list holding the strings of ``lst`` in natural order:
    runs of digits compare as integers, everything else compares
    case-insensitively.
    Credits: http://stackoverflow.com/a/4836734
    """

    def chunk_key(chunk):
        if chunk.isdigit():
            return int(chunk)
        return chunk.lower()

    def sort_key(item):
        # the capturing group keeps the digit runs in the split result
        return list(map(chunk_key, re.split('([0-9]+)', item)))

    return sorted(lst, key=sort_key)
305 # }}}
306 # class _BaseFile {{{
class _BaseFile(list):
    """
    Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
    classes. This class should **not** be instantiated directly.
    """

    def __init__(self, *_args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``pofile``
            string, the path to the po or mo file, or its content as a string.

        ``fpath``
            string, the file path to remember when ``pofile`` is not the path
            of an existing file (optional).

        ``wrapwidth``
            integer, the wrap width, only useful when the ``-w`` option was
            passed to xgettext (optional, default: ``78``).

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file, (optional, default: ``False``).
        """
        list.__init__(self)
        # the path of the file: ``pofile`` when it names an existing file,
        # the ``fpath`` keyword argument otherwise
        pofile = kwargs.get('pofile', None)  # pylint: disable=redefined-outer-name
        if pofile and _is_file(pofile):
            self.fpath = pofile
        else:
            self.fpath = kwargs.get('fpath')
        # the width at which lines should be wrapped
        self.wrapwidth = kwargs.get('wrapwidth', 78)
        # the file encoding
        self.encoding = kwargs.get('encoding', default_encoding)
        # whether to check for duplicate entries or not
        self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
        # header
        self.header = ''
        # both po and mo files have metadata
        self.metadata = {}
        self.metadata_is_fuzzy = 0

    def __unicode__(self):
        """
        Returns the unicode representation of the file: the metadata entry,
        then the non obsolete entries, then the obsolete entries.
        """
        ret = []
        entries = [self.metadata_as_entry()] + [e for e in self if not e.obsolete]
        for entry in entries:
            ret.append(entry.__unicode__(self.wrapwidth))
        for entry in self.obsolete_entries():  # pylint: disable=no-member
            ret.append(entry.__unicode__(self.wrapwidth))
        ret = u('\n').join(ret)
        return ret

    def __str__(self):
        """
        Returns the string representation of the file.
        """
        return self.__unicode__()

    def __contains__(self, entry):
        """
        Overridden ``list`` method to implement the membership test (in and
        not in).
        The method considers that an entry is in the file if it finds an entry
        that has the same msgid (the test is **case sensitive**) and the same
        msgctxt (or none for both entries).

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) is not None

    def __eq__(self, other):
        """
        Two files are equal when their string representations are equal.
        """
        return str(self) == str(other)

    def __hash__(self):
        """
        Hashes the string representation, consistently with ``__eq__``.
        """
        return hash(str(self))

    def append(self, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        # check_for_duplicates may not be defined (yet) when unpickling.
        # But if pickling, we never want to check for duplicates anyway.
        if getattr(self, 'check_for_duplicates', False) and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super().append(entry)

    def insert(self, index, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Arguments:

        ``index``
            index at which the entry should be inserted.

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        if self.check_for_duplicates and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super().insert(index, entry)

    def metadata_as_entry(self):
        """
        Returns the file metadata as a :class:`~polib.POEntry` instance whose
        msgid is empty and whose msgstr holds one ``name: value`` line per
        metadata item. The entry is flagged fuzzy when ``metadata_is_fuzzy``
        is set.
        """
        e = POEntry(msgid='')
        mdata = self.ordered_metadata()
        if mdata:
            strs = []
            for name, value in mdata:
                strs.append(f'{name}: {value}')
            e.msgstr = '\n'.join(strs) + '\n'
        if self.metadata_is_fuzzy:
            e.flags.append('fuzzy')
        return e

    def save(self, fpath=None, repr_method='__unicode__', newline=None):
        """
        Saves the po file to ``fpath``.
        If it is an existing file and no ``fpath`` is provided, then the
        existing file is rewritten with the modified data.

        Keyword arguments:

        ``fpath``
            string, full or relative path to the file.

        ``repr_method``
            string, the method to use for output.

        ``newline``
            string, controls how universal newlines works

        Raises ``OSError`` when neither ``fpath`` nor ``self.fpath`` is set.
        """
        if self.fpath is None and fpath is None:
            raise OSError('You must provide a file path to save() method')
        contents = getattr(self, repr_method)()
        if fpath is None:
            fpath = self.fpath
        if repr_method == 'to_binary':
            with open(fpath, 'wb') as fhandle:
                fhandle.write(contents)
        else:
            # decode before opening so that a decoding error does not leave
            # a truncated file behind
            if not isinstance(contents, text_type):
                contents = contents.decode(self.encoding)
            with open(
                fpath, 'w', encoding=self.encoding, newline=newline
            ) as fhandle:
                fhandle.write(contents)
        # set the file path if not set
        if self.fpath is None and fpath:
            self.fpath = fpath

    def find(self, st, by='msgid', include_obsolete_entries=False, msgctxt=False):
        """
        Find the entry which msgid (or property identified by the ``by``
        argument) matches the string ``st``.

        Keyword arguments:

        ``st``
            string, the string to search for.

        ``by``
            string, the property to use for comparison (default: ``msgid``).

        ``include_obsolete_entries``
            boolean, whether to also search in entries that are obsolete.

        ``msgctxt``
            string, allows specifying a specific message context for the
            search.

        Returns the matching entry or ``None``. When several entries match
        and no context was requested, an entry without msgctxt is preferred
        (the last such match); otherwise the first match is returned.
        """
        if include_obsolete_entries:
            entries = self[:]
        else:
            entries = [e for e in self if not e.obsolete]
        matches = []
        for e in entries:
            if getattr(e, by) == st:
                # ``False`` means "no context filter"; ``None`` is a real value
                if msgctxt is not False and e.msgctxt != msgctxt:
                    continue
                matches.append(e)
        if len(matches) == 1:
            return matches[0]
        elif len(matches) > 1:
            if not msgctxt:
                # find the entry with no msgctx
                e = None
                for m in matches:
                    if not m.msgctxt:
                        e = m
                if e:
                    return e
            # fallback to the first entry found
            return matches[0]
        return None

    def ordered_metadata(self):
        """
        Convenience method that returns an ordered version of the metadata
        dictionary. The return value is list of tuples (metadata name,
        metadata_value).
        """
        # copy the dict first
        metadata = self.metadata.copy()
        data_order = [
            'Project-Id-Version',
            'Report-Msgid-Bugs-To',
            'POT-Creation-Date',
            'PO-Revision-Date',
            'Last-Translator',
            'Language-Team',
            'Language',
            'MIME-Version',
            'Content-Type',
            'Content-Transfer-Encoding',
            'Plural-Forms',
        ]
        ordered_data = []
        for data in data_order:
            try:
                value = metadata.pop(data)
                ordered_data.append((data, value))
            except KeyError:
                pass
        # the rest of the metadata will be alphabetically ordered since there
        # are no specs for this AFAIK
        for data in natural_sort(metadata.keys()):
            value = metadata[data]
            ordered_data.append((data, value))
        return ordered_data

    def to_binary(self):
        """
        Return the binary representation of the file.
        """
        offsets = []
        # the keys are sorted in the .mo file; ``sorted`` builds a new list so
        # that the file itself is never reordered, even when
        # ``translated_entries()`` returns ``self``
        entries = sorted(
            self.translated_entries(),  # pylint: disable=no-member
            key=lambda o: o.msgid_with_context.encode('utf-8'),
        )
        # add metadata entry
        mentry = self.metadata_as_entry()
        entries = [mentry] + entries
        entries_len = len(entries)
        ids, strs = b(''), b('')
        for e in entries:
            # For each string, we need size and file offset.  Each string is
            # NUL terminated; the NUL does not count into the size.
            msgid = b('')
            if e.msgctxt:
                # Contexts are stored by storing the concatenation of the
                # context, a <EOT> byte, and the original string
                msgid = self._encode(e.msgctxt + '\4')
            if e.msgid_plural:
                msgstr = []
                for index in sorted(e.msgstr_plural.keys()):
                    msgstr.append(e.msgstr_plural[index])
                msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
                msgstr = self._encode('\0'.join(msgstr))
            else:
                msgid += self._encode(e.msgid)
                msgstr = self._encode(e.msgstr)
            offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
            ids += msgid + b('\0')
            strs += msgstr + b('\0')

        # The header is 7 32-bit unsigned integers; it is followed by the key
        # and value index tables (2 integers per entry in each table).
        keystart = 7 * 4 + 16 * entries_len
        # and the values start after the keys
        valuestart = keystart + len(ids)
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets += [l1, o1 + keystart]
            voffsets += [l2, o2 + valuestart]
        offsets = koffsets + voffsets

        output = struct.pack(
            'Iiiiiii',
            # Magic number
            MOFile.MAGIC,
            # Version
            0,
            # number of entries
            entries_len,
            # start of key index
            7 * 4,
            # start of value index
            7 * 4 + entries_len * 8,
            # size and offset of hash table, we don't use hash tables
            0,
            keystart,
        )
        output += array.array('i', offsets).tobytes()
        output += ids
        output += strs
        return output

    def _encode(self, mixed):
        """
        Encodes the given ``mixed`` argument with the file encoding if and
        only if it's an unicode string and returns the encoded string.
        """
        if isinstance(mixed, text_type):
            mixed = mixed.encode(self.encoding)
        return mixed
658 # }}}
659 # class POFile {{{
class POFile(_BaseFile):
    """
    Reader/writer for po (or pot) files.
    Being a :class:`~polib._BaseFile`, it is also a python ``list`` of
    entries.
    """

    def __unicode__(self):
        """
        Returns the unicode representation of the po file: the header lines
        written as comments, followed by the representation of the entries.
        """
        comment_lines = []
        for header in self.header.split('\n'):
            if not header:
                comment_lines.append('#\n')
            elif header[:1] in [',', ':']:
                # keep "#," and "#:" markers glued to the hash sign
                comment_lines.append('#%s\n' % header)
            else:
                comment_lines.append('# %s\n' % header)
        ret = ''.join(comment_lines)

        if not isinstance(ret, text_type):
            ret = ret.decode(self.encoding)

        return ret + _BaseFile.__unicode__(self)

    def save_as_mofile(self, fpath):
        """
        Writes the binary (mo) representation of the file to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the mo file.
        """
        _BaseFile.save(self, fpath, repr_method='to_binary')

    def percent_translated(self):
        """
        Returns the percentage of translated messages as an integer; the
        obsolete entries are not counted in the total. A file without any
        non obsolete entry is reported as ``100``.
        """
        total = sum(1 for entry in self if not entry.obsolete)
        if total == 0:
            return 100
        translated = len(self.translated_entries())
        return int(translated * 100 / float(total))

    def translated_entries(self):
        """
        Returns the list of the translated entries.
        """
        return [entry for entry in self if entry.translated()]

    def untranslated_entries(self):
        """
        Returns the list of the entries that are neither translated, nor
        obsolete, nor fuzzy.
        """
        return [
            entry
            for entry in self
            if not (entry.translated() or entry.obsolete or entry.fuzzy)
        ]

    def fuzzy_entries(self):
        """
        Returns the list of the fuzzy entries that are not obsolete.
        """
        return [entry for entry in self if entry.fuzzy and not entry.obsolete]

    def obsolete_entries(self):
        """
        Returns the list of the obsolete entries.
        """
        return [entry for entry in self if entry.obsolete]

    def merge(self, refpot):
        """
        Merges the current pofile with the reference pot file ``refpot``.

        Every entry of ``refpot`` is merged into the entry of this file that
        has the same msgid and context; a new entry is appended first when
        this file has no such entry. The entries of this file that are not
        in ``refpot`` are then marked as obsolete.

        Keyword argument:

        ``refpot``
            object POFile, the reference catalog.
        """
        # index the entries by msgid + context for faster access
        own_entries = {entry.msgid_with_context: entry for entry in self}
        reference_keys = {entry.msgid_with_context for entry in refpot}
        # merge the entries that are in the refpot
        for reference in refpot:
            target = own_entries.get(reference.msgid_with_context)
            if target is None:
                target = POEntry()
                self.append(target)
            target.merge(reference)
        # ok, now we must "obsolete" entries that are not in the refpot anymore
        for entry in self:
            if entry.msgid_with_context not in reference_keys:
                entry.obsolete = True
767 # }}}
768 # class MOFile {{{
class MOFile(_BaseFile):
    """
    Reader/writer for mo files.
    Being a :class:`~polib._BaseFile`, it is also a python ``list`` of
    entries.
    """

    # magic number of a mo file, and the same number with its bytes swapped
    MAGIC = 0x950412DE
    MAGIC_SWAPPED = 0xDE120495

    def __init__(self, *args, **kwargs):
        """
        Constructor, takes the same keyword arguments as the
        :class:`~polib._BaseFile` class.
        """
        _BaseFile.__init__(self, *args, **kwargs)
        self.version = 0
        self.magic_number = None

    def save_as_pofile(self, fpath):
        """
        Writes the mofile to ``fpath`` using the po representation.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath)

    # pylint: disable=arguments-differ
    def save(self, fpath=None):
        """
        Writes the binary representation of the mofile to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath, repr_method='to_binary')

    def percent_translated(self):
        """
        Always ``100``; exists to offer the same interface as POFile.
        """
        return 100

    def translated_entries(self):
        """
        Returns the file itself; exists to offer the same interface as
        POFile.
        """
        return self

    def untranslated_entries(self):
        """
        Always an empty list; exists to offer the same interface as POFile.
        """
        return []

    def fuzzy_entries(self):
        """
        Always an empty list; exists to offer the same interface as POFile.
        """
        return []

    def obsolete_entries(self):
        """
        Always an empty list; exists to offer the same interface as POFile.
        """
        return []
844 # }}}
845 # class _BaseEntry {{{
class _BaseEntry:
    """
    Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
    This class should **not** be instantiated directly.
    """

    def __init__(self, *_args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``msgid``
            string, the entry msgid.

        ``msgstr``
            string, the entry msgstr.

        ``msgid_plural``
            string, the entry msgid_plural.

        ``msgstr_plural``
            dict, the entry msgstr_plural lines.

        ``msgctxt``
            string, the entry context (msgctxt).

        ``obsolete``
            bool, whether the entry is "obsolete" or not.

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).
        """
        self.msgid = kwargs.get('msgid', '')
        self.msgstr = kwargs.get('msgstr', '')
        self.msgid_plural = kwargs.get('msgid_plural', '')
        self.msgstr_plural = kwargs.get('msgstr_plural', {})
        self.msgctxt = kwargs.get('msgctxt', None)
        self.obsolete = kwargs.get('obsolete', False)
        self.encoding = kwargs.get('encoding', default_encoding)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry.
        """
        if self.obsolete:
            delflag = '#~ '
        else:
            delflag = ''
        ret = []
        # write the msgctxt if any
        if self.msgctxt is not None:
            ret += self._str_field('msgctxt', delflag, '', self.msgctxt, wrapwidth)
        # write the msgid
        ret += self._str_field('msgid', delflag, '', self.msgid, wrapwidth)
        # write the msgid_plural if any
        if self.msgid_plural:
            ret += self._str_field(
                'msgid_plural', delflag, '', self.msgid_plural, wrapwidth
            )
        if self.msgstr_plural:
            # write the msgstr_plural if any
            msgstrs = self.msgstr_plural
            for index in sorted(msgstrs):
                msgstr = msgstrs[index]
                plural_index = '[%s]' % index
                ret += self._str_field(
                    'msgstr', delflag, plural_index, msgstr, wrapwidth
                )
        else:
            # otherwise write the msgstr
            ret += self._str_field('msgstr', delflag, '', self.msgstr, wrapwidth)
        # the trailing empty item makes the representation end with a newline
        ret.append('')
        ret = u('\n').join(ret)
        return ret

    def __str__(self):
        """
        Returns the string representation of the entry.
        """
        return self.__unicode__()

    def __eq__(self, other):
        """
        Two entries are equal when their string representations are equal.
        """
        return str(self) == str(other)

    def __hash__(self):
        """
        Hashes the string representation, consistently with ``__eq__``.
        """
        return hash(str(self))

    def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78):
        """
        Returns the list of the lines that represent one field of the entry.

        Arguments:

        ``fieldname``
            string, the name of the field; a leading ``previous_`` is
            removed from the name that is written.

        ``delflag``
            string, the prefix written at the beginning of every line.

        ``plural_index``
            string, the plural index (e.g. ``[0]``) written after the field
            name, or an empty string.

        ``field``
            string, the value of the field.

        ``wrapwidth``
            integer, the wrap width; a value lower than or equal to ``0``
            disables the wrapping.
        """
        lines = field.splitlines(True)
        if len(lines) > 1:
            lines = [''] + lines  # start with initial empty line
        else:
            escaped_field = escape(field)
            specialchars_count = 0
            for c in ['\\', '\n', '\r', '\t', '"']:
                specialchars_count += field.count(c)
            # comparison must take into account fieldname length + one space
            # + 2 quotes (eg. msgid "<string>")
            flength = len(fieldname) + 3
            if plural_index:
                flength += len(plural_index)
            real_wrapwidth = wrapwidth - flength + specialchars_count
            if wrapwidth > 0 and len(field) > real_wrapwidth:
                # Wrap the line but take field name into account
                lines = [''] + [
                    unescape(item)
                    for item in textwrap.wrap(
                        escaped_field,
                        wrapwidth - 2,  # 2 for quotes ""
                        drop_whitespace=False,
                        break_long_words=False,
                    )
                ]
            else:
                lines = [field]
        if fieldname.startswith('previous_'):
            # quick and dirty trick to get the real field name
            fieldname = fieldname[9:]

        ret = [f'{delflag}{fieldname}{plural_index} "{escape(lines.pop(0))}"']
        for line in lines:
            ret.append(f'{delflag}"{escape(line)}"')
        return ret

    @property
    def msgid_with_context(self):
        """
        The msgid, prefixed with the msgctxt and an <EOT> character when the
        entry has a (non empty) context.
        """
        if self.msgctxt:
            return f'{self.msgctxt}\x04{self.msgid}'
        return self.msgid
988 # }}}
989 # class POEntry {{{
class POEntry(_BaseEntry):
    """
    A single entry of a po file.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, takes the keyword arguments of
        :class:`~polib._BaseEntry` plus the following ones:

        ``comment``
            string, the entry comment.

        ``tcomment``
            string, the entry translator comment.

        ``occurrences``
            list, the entry occurrences.

        ``flags``
            list, the entry flags.

        ``previous_msgctxt``
            string, the entry previous context.

        ``previous_msgid``
            string, the entry previous msgid.

        ``previous_msgid_plural``
            string, the entry previous msgid_plural.

        ``linenum``
            integer, the line number of the entry
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        self.comment = kwargs.get('comment', '')
        self.tcomment = kwargs.get('tcomment', '')
        self.occurrences = kwargs.get('occurrences', [])
        self.flags = kwargs.get('flags', [])
        self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
        self.previous_msgid = kwargs.get('previous_msgid', None)
        self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
        self.linenum = kwargs.get('linenum', None)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry: comments,
        occurrences, flags and previous fields, followed by the fields
        written by the base class.
        """
        out = []
        # comments first, if any (with text wrapping as xgettext does);
        # an obsolete entry only keeps its translator comment
        comment_kinds = [('tcomment', '# ')]
        if not self.obsolete:
            comment_kinds.insert(0, ('comment', '#. '))
        for attribute, marker in comment_kinds:
            text = getattr(self, attribute)
            if not text:
                continue
            for comment in text.split('\n'):
                if len(comment) + len(marker) > wrapwidth > 0:
                    out += textwrap.wrap(
                        comment,
                        wrapwidth,
                        initial_indent=marker,
                        subsequent_indent=marker,
                        break_long_words=False,
                    )
                else:
                    out.append(f'{marker}{comment}')

        # occurrences (with text wrapping as xgettext does)
        if not self.obsolete and self.occurrences:
            references = []
            for fpath, lineno in self.occurrences:
                if lineno:
                    references.append(f'{fpath}:{lineno}')
                else:
                    references.append(fpath)
            filestr = ' '.join(references)
            if len(filestr) + 3 > wrapwidth > 0:
                # textwrap split words that contain hyphen, this is not
                # what we want for filenames, so the dirty hack is to
                # temporally replace hyphens with a char that a file cannot
                # contain, like "*"
                wrapped = textwrap.wrap(
                    filestr.replace('-', '*'),
                    wrapwidth,
                    initial_indent='#: ',
                    subsequent_indent='#: ',
                    break_long_words=False,
                )
                out += [line.replace('*', '-') for line in wrapped]
            else:
                out.append('#: ' + filestr)

        # flags (TODO: wrapping ?)
        if self.flags:
            out.append('#, %s' % ', '.join(self.flags))

        # previous context and previous msgid/msgid_plural
        prefix = '#~| ' if self.obsolete else '#| '
        for name in ('previous_msgctxt', 'previous_msgid', 'previous_msgid_plural'):
            previous = getattr(self, name)
            if previous is not None:
                out += self._str_field(name, prefix, '', previous, wrapwidth)

        out.append(_BaseEntry.__unicode__(self, wrapwidth))
        return u('\n').join(out)

    # pylint: disable=too-many-return-statements
    def __cmp__(self, other):
        """
        Three-way comparison used by the rich comparison methods. Returns
        ``-1``, ``0`` or ``1``.

        The criteria are, in order: the obsolete state (an obsolete entry
        sorts first), the sorted occurrences, the msgctxt, the msgid_plural,
        the msgstr_plural values, the msgid and the msgstr.
        """

        def three_way(left, right):
            if left > right:
                return 1
            if left < right:
                return -1
            return 0

        def plural_values(entry):
            plurals = entry.msgstr_plural
            if plurals and isinstance(plurals, dict):
                return list(plurals.values())
            return []

        # First: Obsolete test
        if self.obsolete != other.obsolete:
            return -1 if self.obsolete else 1
        # Work on a copy to protect original
        result = three_way(sorted(self.occurrences[:]), sorted(other.occurrences[:]))
        if result:
            return result
        # Compare context (a missing context compares as '0')
        result = three_way(self.msgctxt or '0', other.msgctxt or '0')
        if result:
            return result
        # Compare msgid_plural (a missing plural compares as '0')
        result = three_way(self.msgid_plural or '0', other.msgid_plural or '0')
        if result:
            return result
        # Compare msgstr_plural
        result = three_way(plural_values(self), plural_values(other))
        if result:
            return result
        # Compare msgid
        result = three_way(self.msgid, other.msgid)
        if result:
            return result
        # Compare msgstr
        return three_way(self.msgstr, other.msgstr)

    def __gt__(self, other):
        return self.__cmp__(other) > 0

    def __lt__(self, other):
        return self.__cmp__(other) < 0

    def __ge__(self, other):
        return self.__cmp__(other) >= 0

    def __le__(self, other):
        return self.__cmp__(other) <= 0

    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0

    def translated(self):
        """
        Tells whether the entry has been translated.

        An obsolete or fuzzy entry is never translated. Otherwise the entry
        is translated when its msgstr is not empty, or when it has plural
        msgstrs and none of them is empty.
        """
        if self.obsolete or self.fuzzy:
            return False
        if self.msgstr != '':
            return True
        if not self.msgstr_plural:
            return False
        for pos in self.msgstr_plural:
            if self.msgstr_plural[pos] == '':
                return False
        return True

    def merge(self, other):
        """
        Merges the given pot entry ``other`` into the current entry.

        The msgid, context, occurrences, comment, flags, msgid_plural,
        obsolete state and previous fields are taken from ``other``. The
        fuzzy flag of the current entry is kept, and so are its existing
        plural translations: only the plural positions that are missing are
        added, with an empty string.
        """
        # must be read before the flags are replaced
        was_fuzzy = self.fuzzy
        self.msgid = other.msgid
        self.msgctxt = other.msgctxt
        self.occurrences = other.occurrences
        self.comment = other.comment
        self.flags = other.flags[:]  # clone flags
        if was_fuzzy:
            self.flags.append('fuzzy')
        self.msgid_plural = other.msgid_plural
        self.obsolete = other.obsolete
        self.previous_msgctxt = other.previous_msgctxt
        self.previous_msgid = other.previous_msgid
        self.previous_msgid_plural = other.previous_msgid_plural
        if other.msgstr_plural:
            for pos in other.msgstr_plural:
                # keep existing translation at pos if any
                self.msgstr_plural.setdefault(pos, '')

    @property
    def fuzzy(self):
        """Whether ``'fuzzy'`` is one of the flags of the entry."""
        return 'fuzzy' in self.flags

    def __hash__(self):
        return hash((self.msgid, self.msgstr))
1230 # }}}
1231 # class MOEntry {{{
class MOEntry(_BaseEntry):
    """
    A single entry of a mo file.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments,
        for consistency with :class:`~polib.POEntry`:

        ``comment``
        ``tcomment``
        ``occurrences``
        ``flags``
        ``previous_msgctxt``
        ``previous_msgid``
        ``previous_msgid_plural``

        Note: even though these keyword arguments are accepted,
        they hold no real meaning in the context of MO files
        and are simply ignored: the matching attributes are always
        set to empty values.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        self.comment = self.tcomment = ''
        self.occurrences, self.flags = [], []
        self.previous_msgctxt = None
        self.previous_msgid = None
        self.previous_msgid_plural = None

    def __hash__(self):
        key = (self.msgid, self.msgstr)
        return hash(key)
1269 # }}}
1270 # class _POFileParser {{{
class _POFileParser:
    """
    A finite state machine that parses the po file format.

    The constructor fills a transition table keyed by ``(symbol, state)``.
    :meth:`parse` classifies every line into a symbol and :meth:`process`
    runs the ``handle_*`` method registered for that symbol in the current
    state.
    """

    # pylint: disable=redefined-outer-name
    def __init__(self, pofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``pofile``
            string, path to the po file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class instantiated to hold the parsed entries, defaults to
            ``POFile`` (optional).
        """
        enc = kwargs.get('encoding', default_encoding)
        if _is_file(pofile):
            try:
                self.fhandle = open(pofile, encoding=enc)
            except LookupError:
                # unknown encoding name: fall back to the default one
                enc = default_encoding
                self.fhandle = open(pofile, encoding=enc)
        else:
            self.fhandle = pofile.splitlines()

        klass = kwargs.get('klass')
        if klass is None:
            klass = POFile
        self.instance = klass(
            pofile=pofile,
            encoding=enc,
            check_for_duplicates=kwargs.get('check_for_duplicates', False),
        )
        self.transitions = {}
        self.current_line = 0
        self.current_entry = POEntry(linenum=self.current_line)
        self.current_state = 'st'
        self.current_token = None
        # two memo flags used in handlers
        self.msgstr_index = 0
        self.entry_obsolete = 0
        # Configure the state machine, by adding transitions.
        # Signification of symbols:
        #     * ST: Beginning of the file (start)
        #     * HE: Header
        #     * TC: a translation comment
        #     * GC: a generated comment
        #     * OC: a file/line occurrence
        #     * FL: a flags line
        #     * CT: a message context
        #     * PC: a previous msgctxt
        #     * PM: a previous msgid
        #     * PP: a previous msgid_plural
        #     * MI: a msgid
        #     * MP: a msgid plural
        #     * MS: a msgstr
        #     * MX: a msgstr plural
        #     * MC: a msgid or msgstr continuation line
        all_states = [
            'st',
            'he',
            'gc',
            'oc',
            'fl',
            'ct',
            'pc',
            'pm',
            'pp',
            'tc',
            'ms',
            'mp',
            'mx',
            'mi',
        ]

        # a comment seen before any entry belongs to the file header
        self.add('tc', ['st', 'he'], 'he')
        self.add(
            'tc',
            ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mp', 'mx', 'mi'],
            'tc',
        )
        self.add('gc', all_states, 'gc')
        self.add('oc', all_states, 'oc')
        self.add('fl', all_states, 'fl')
        self.add('pc', all_states, 'pc')
        self.add('pm', all_states, 'pm')
        self.add('pp', all_states, 'pp')
        self.add(
            'ct',
            ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
            'ct',
        )
        self.add(
            'mi',
            ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
            'mi',
        )
        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
        self.add('ms', ['mi', 'mp', 'tc'], 'ms')
        self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')

    # pylint: disable=too-many-branches
    def parse(self):
        """
        Run the state machine, parse the file line by line and call process()
        with the current matched symbol.

        Returns the populated file instance. Raises ``OSError`` on a syntax
        error. A file handle opened by the constructor is closed on every
        exit path, including errors.
        """
        keywords = {
            'msgctxt': 'ct',
            'msgid': 'mi',
            'msgstr': 'ms',
            'msgid_plural': 'mp',
        }
        prev_keywords = {
            'msgid_plural': 'pp',
            'msgid': 'pm',
            'msgctxt': 'pc',
        }
        try:
            tokens = []
            fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''

            def check_quotes(text):
                # a double quote inside the delimiting quotes must be escaped
                if re.search(r'([^\\]|^)"', text[1:-1]):
                    raise OSError(
                        'Syntax error in po file %s(line %s): '
                        'unescaped double quote found' % (fpath, self.current_line)
                    )

            for line in self.fhandle:
                self.current_line += 1
                if self.current_line == 1:
                    # drop a leading UTF-8 byte order mark
                    bom = codecs.BOM_UTF8.decode('utf-8')
                    if line.startswith(bom):
                        line = line[len(bom) :]
                line = line.strip()
                if line == '':
                    continue

                tokens = line.split(None, 2)
                nb_tokens = len(tokens)

                if tokens[0] == '#~|':
                    continue

                if tokens[0] == '#~' and nb_tokens > 1:
                    # obsolete entry: drop the marker and parse the rest
                    line = line[3:].strip()
                    tokens = tokens[1:]
                    nb_tokens -= 1
                    self.entry_obsolete = 1
                else:
                    self.entry_obsolete = 0

                # Take care of keywords like
                # msgid, msgid_plural, msgctxt & msgstr.
                if tokens[0] in keywords and nb_tokens > 1:
                    line = line[len(tokens[0]) :].lstrip()
                    check_quotes(line)
                    self.current_token = line
                    self.process(keywords[tokens[0]])
                    continue

                self.current_token = line

                if tokens[0] == '#:':
                    if nb_tokens <= 1:
                        continue
                    # we are on a occurrences line
                    self.process('oc')

                elif line[:1] == '"':
                    # we are on a continuation line
                    check_quotes(line)
                    self.process('mc')

                elif line[:7] == 'msgstr[':
                    # we are on a msgstr plural
                    self.process('mx')

                elif tokens[0] == '#,':
                    if nb_tokens <= 1:
                        continue
                    # we are on a flags line
                    self.process('fl')

                elif tokens[0] == '#' or tokens[0].startswith('##'):
                    # we are on a translator comment line
                    self.process('tc')

                elif tokens[0] == '#.':
                    if nb_tokens <= 1:
                        continue
                    # we are on a generated comment line
                    self.process('gc')

                elif tokens[0] == '#|':
                    if nb_tokens <= 1:
                        raise OSError(
                            'Syntax error in po file %s(line %s)'
                            % (fpath, self.current_line)
                        )

                    # Remove the marker and any whitespace right after that.
                    line = line[2:].lstrip()
                    self.current_token = line

                    if tokens[1].startswith('"'):
                        # Continuation of previous metadata.
                        self.process('mc')
                        continue

                    if nb_tokens == 2:
                        # Invalid continuation line.
                        raise OSError(
                            'Syntax error in po file %s(line %s): '
                            'invalid continuation line' % (fpath, self.current_line)
                        )

                    # we are on a "previous translation" comment line,
                    if tokens[1] not in prev_keywords:
                        # Unknown keyword in previous translation comment.
                        raise OSError(
                            'Syntax error in po file %s(line %s): '
                            'unknown keyword %s' % (fpath, self.current_line, tokens[1])
                        )

                    # Remove the keyword and any whitespace
                    # between it and the starting quote.
                    line = line[len(tokens[1]) :].lstrip()
                    self.current_token = line
                    self.process(prev_keywords[tokens[1]])

                else:
                    raise OSError(
                        f'Syntax error in po file {fpath}(line {self.current_line})'
                    )

            if self.current_entry and tokens and not tokens[0].startswith('#'):
                # since entries are added when another entry is found, we must add
                # the last entry here (only if there are lines). Trailing comments
                # are ignored
                self.instance.append(self.current_entry)

            # before returning the instance, check if there's metadata and if
            # so extract it in a dict
            metadataentry = self.instance.find('')
            if metadataentry:  # metadata found
                # remove the entry
                self.instance.remove(metadataentry)
                self.instance.metadata_is_fuzzy = metadataentry.flags
                key = None
                for msg in metadataentry.msgstr.splitlines():
                    try:
                        key, val = msg.split(':', 1)
                        self.instance.metadata[key] = val.strip()
                    except (ValueError, KeyError):
                        # no "key: value" pair on this line: treat it as a
                        # continuation of the previous key, if there is one
                        if key is not None:
                            self.instance.metadata[key] += '\n' + msg.strip()
        finally:
            # close opened file, also when a syntax error aborted the parsing
            if not isinstance(self.fhandle, list):  # must be file
                self.fhandle.close()
        return self.instance

    def add(self, symbol, states, next_state):
        """
        Add a transition to the state machine.

        Keyword arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``states``
            list, a list of states (two chars symbols).

        ``next_state``
            the next state the fsm will have after the action.
        """
        action = getattr(self, 'handle_%s' % next_state)
        for state in states:
            self.transitions[(symbol, state)] = (action, next_state)

    def process(self, symbol):
        """
        Process the transition corresponding to the current state and the
        symbol provided.

        Keyword arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        Any error raised while looking up or running the transition is
        reported as an ``OSError`` naming the current line, after closing
        the file handle.
        """
        try:
            (action, state) = self.transitions[(symbol, self.current_state)]
            # handlers return a false value to leave the state unchanged
            if action():
                self.current_state = state
        except Exception as exc:
            fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
            if hasattr(self.fhandle, 'close'):
                self.fhandle.close()
            raise OSError(
                f'Syntax error in po file {fpath}(line {self.current_line})'
            ) from exc

    def _start_entry_if_needed(self):
        """Store the finished entry and open a new one after a msgstr."""
        if self.current_state in ('mc', 'ms', 'mx'):
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)

    # state handlers

    def handle_he(self):
        """Handle a header comment."""
        if self.instance.header != '':
            self.instance.header += '\n'
        self.instance.header += self.current_token[2:]
        return True

    def handle_tc(self):
        """Handle a translator comment."""
        self._start_entry_if_needed()
        if self.current_entry.tcomment != '':
            self.current_entry.tcomment += '\n'
        tcomment = self.current_token.lstrip('#')
        if tcomment.startswith(' '):
            tcomment = tcomment[1:]
        self.current_entry.tcomment += tcomment
        return True

    def handle_gc(self):
        """Handle a generated comment."""
        self._start_entry_if_needed()
        if self.current_entry.comment != '':
            self.current_entry.comment += '\n'
        self.current_entry.comment += self.current_token[3:]
        return True

    def handle_oc(self):
        """Handle a file:num occurrence."""
        self._start_entry_if_needed()
        for occurrence in self.current_token[3:].split():
            fil, sep, line = occurrence.rpartition(':')
            if sep and line.isdigit():
                self.current_entry.occurrences.append((fil, line))
            else:
                # no line number: keep the whole reference as the file name
                self.current_entry.occurrences.append((occurrence, ''))
        return True

    def handle_fl(self):
        """Handle a flags line."""
        self._start_entry_if_needed()
        self.current_entry.flags += [
            c.strip() for c in self.current_token[3:].split(',')
        ]
        return True

    def handle_pp(self):
        """Handle a previous msgid_plural line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_pm(self):
        """Handle a previous msgid line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgid = unescape(self.current_token[1:-1])
        return True

    def handle_pc(self):
        """Handle a previous msgctxt line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_ct(self):
        """Handle a msgctxt."""
        self._start_entry_if_needed()
        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_mi(self):
        """Handle a msgid."""
        self._start_entry_if_needed()
        self.current_entry.obsolete = self.entry_obsolete
        self.current_entry.msgid = unescape(self.current_token[1:-1])
        return True

    def handle_mp(self):
        """Handle a msgid plural."""
        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_ms(self):
        """Handle a msgstr."""
        self.current_entry.msgstr = unescape(self.current_token[1:-1])
        return True

    def handle_mx(self):
        """Handle a msgstr plural."""
        token = self.current_token
        # read every digit of the index, so that msgstr[10] is not taken for 1
        match = re.match(r'msgstr\[(\d+)', token)
        if match is None:
            raise ValueError('invalid plural index in %r' % token)
        index = int(match.group(1))
        value = token[token.find('"') + 1 : -1]
        self.current_entry.msgstr_plural[index] = unescape(value)
        self.msgstr_index = index
        return True

    def handle_mc(self):
        """Handle a msgid or msgstr continuation line."""
        token = unescape(self.current_token[1:-1])
        if self.current_state == 'ct':
            self.current_entry.msgctxt += token
        elif self.current_state == 'mi':
            self.current_entry.msgid += token
        elif self.current_state == 'mp':
            self.current_entry.msgid_plural += token
        elif self.current_state == 'ms':
            self.current_entry.msgstr += token
        elif self.current_state == 'mx':
            self.current_entry.msgstr_plural[self.msgstr_index] += token
        elif self.current_state == 'pp':
            self.current_entry.previous_msgid_plural += token
        elif self.current_state == 'pm':
            self.current_entry.previous_msgid += token
        elif self.current_state == 'pc':
            self.current_entry.previous_msgctxt += token
        # don't change the current state
        return False
1733 # }}}
1734 # class _MOFileParser {{{
class _MOFileParser:
    """
    A class to parse binary mo files.
    """

    # pylint: disable=unused-argument,redefined-outer-name
    def __init__(self, mofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``mofile``
            string, path to the mo file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class instantiated to hold the parsed entries, defaults to
            ``MOFile`` (optional).
        """
        if _is_file(mofile):
            self.fhandle = open(mofile, 'rb')
        else:
            self.fhandle = io.BytesIO(mofile)

        klass = kwargs.get('klass')
        if klass is None:
            klass = MOFile
        self.instance = klass(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False),
        )

    def __del__(self):
        """
        Make sure the file is closed, this prevents warnings on unclosed file
        when running tests with python >= 3.2.
        """
        # the attribute is missing when the constructor failed before
        # opening the file
        fhandle = getattr(self, 'fhandle', None)
        if fhandle and hasattr(fhandle, 'close'):
            fhandle.close()

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Returns the populated file instance. Raises ``OSError`` when the
        magic number or the major revision number is not recognised. The
        file handle is closed on every exit path, including errors.
        """
        try:
            # parse magic number
            magic_number = self._readbinary('<I', 4)
            if magic_number == MOFile.MAGIC:
                ii = '<II'
            elif magic_number == MOFile.MAGIC_SWAPPED:
                ii = '>II'
            else:
                raise OSError('Invalid mo file, magic number is incorrect !')
            self.instance.magic_number = magic_number
            # parse the version number and the number of strings
            version, numofstrings = self._readbinary(ii, 8)
            # from MO file format specs: "A program seeing an unexpected major
            # revision number should stop reading the MO file entirely"
            if version >> 16 not in (0, 1):
                raise OSError('Invalid mo file, unexpected major revision number')
            self.instance.version = version
            # original strings and translation strings hash table offset
            msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
            # move to msgid hash table and read length and offset of msgids
            self.fhandle.seek(msgids_hash_offset)
            msgids_index = [self._readbinary(ii, 8) for _ in range(numofstrings)]
            # move to msgstr hash table and read length and offset of msgstrs
            self.fhandle.seek(msgstrs_hash_offset)
            msgstrs_index = [self._readbinary(ii, 8) for _ in range(numofstrings)]
            # build entries
            encoding = self.instance.encoding
            for i in range(numofstrings):
                self.fhandle.seek(msgids_index[i][1])
                msgid = self.fhandle.read(msgids_index[i][0])

                self.fhandle.seek(msgstrs_index[i][1])
                msgstr = self.fhandle.read(msgstrs_index[i][0])
                if i == 0 and not msgid:  # metadata
                    raw_metadata, metadata = msgstr.split(b('\n')), {}
                    for line in raw_metadata:
                        tokens = line.split(b(':'), 1)
                        if tokens[0] != b(''):
                            try:
                                k = tokens[0].decode(encoding)
                                v = tokens[1].decode(encoding)
                                metadata[k] = v.strip()
                            except IndexError:
                                # line without a colon: key with empty value
                                metadata[k] = u('')
                    self.instance.metadata = metadata
                    continue
                # test if we have a plural entry
                msgid_tokens = msgid.split(b('\0'))
                if len(msgid_tokens) > 1:
                    entry = self._build_entry(
                        msgid=msgid_tokens[0],
                        msgid_plural=msgid_tokens[1],
                        msgstr_plural=dict(enumerate(msgstr.split(b('\x00')))),
                    )
                else:
                    entry = self._build_entry(msgid=msgid, msgstr=msgstr)
                self.instance.append(entry)
        finally:
            # close opened file, also when the file turned out to be invalid
            self.fhandle.close()
        return self.instance

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None, msgstr_plural=None):
        """
        Build a :class:`MOEntry` from raw byte strings.

        ``msgid`` is split on the ``\\x04`` byte; when a separator is
        present the first part becomes the ``msgctxt`` and the second the
        ``msgid``. All values are decoded with the instance encoding; the
        values of ``msgstr_plural`` are decoded in place.
        """
        msgctxt_msgid = msgid.split(b('\x04'))
        encoding = self.instance.encoding
        if len(msgctxt_msgid) > 1:
            kwargs = {
                'msgctxt': msgctxt_msgid[0].decode(encoding),
                'msgid': msgctxt_msgid[1].decode(encoding),
            }
        else:
            kwargs = {'msgid': msgid.decode(encoding)}
        if msgstr:
            kwargs['msgstr'] = msgstr.decode(encoding)
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
        if msgstr_plural:
            for k in msgstr_plural:
                msgstr_plural[k] = msgstr_plural[k].decode(encoding)
            kwargs['msgstr_plural'] = msgstr_plural
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpack n bytes of data using format <fmt>.
        It returns a tuple or a mixed value if the tuple length is 1.
        """
        content = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, content)
        if len(tup) == 1:
            return tup[0]
        return tup
1883 # }}}