polib: local tweaks for pylint compliance
[git-cola.git] / cola / polib.py
blobf6bc0309ed34da944d9c7bb2e372329b67f03976
1 # -* coding: utf-8 -*-
3 # License: MIT (see LICENSE file provided)
4 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
6 """
7 **polib** allows you to manipulate, create, modify gettext files (pot, po and
8 mo files). You can load existing files, iterate through its entries, add,
9 modify entries, comments or metadata, etc. or create new po files from scratch.
11 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
12 :func:`~polib.mofile` convenience functions.
13 """
14 from __future__ import absolute_import, division, print_function
15 import array
16 import codecs
17 import os
18 import re
19 import struct
20 import sys
21 import textwrap
22 import io
24 from . import compat
27 __author__ = 'David Jean Louis <izimobil@gmail.com>'
28 __version__ = '1.1.1'
29 __all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
30 'default_encoding', 'escape', 'unescape', 'detect_encoding', ]
33 # the default encoding to use when encoding cannot be detected
34 default_encoding = 'utf-8'
36 # python 2/3 compatibility helpers {{{
# True when running under Python 3 or newer; selects the helper flavour below.
PY3 = sys.version_info >= (3,)

if PY3:
    # Native ``str`` is already the text type on Python 3.
    text_type = str

    def b(s):
        """Return the text ``s`` encoded to UTF-8 bytes."""
        return s.encode("utf-8")

    def u(s):
        """Return ``s`` unchanged."""
        return s

else:
    # On Python 2 the text type is supplied by the ``compat`` module.
    text_type = compat.ustr

    def b(s):
        """Return ``s`` unchanged."""
        return s

    def u(s):
        """Return ``s`` converted through ``compat.ustr`` ("unicode_escape")."""
        return compat.ustr(s, "unicode_escape")
58 # }}}
59 # _pofile_or_mofile {{{
def _pofile_or_mofile(f, filetype, **kwargs):
    """
    Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
    honor the DRY concept.

    Arguments:

    ``f``
        path to the file, or its content; handed to the parser as its first
        positional argument.

    ``filetype``
        ``'pofile'`` selects ``_POFileParser``; any other value selects
        ``_MOFileParser``.

    Recognised keyword arguments: ``encoding`` (auto-detected when missing or
    ``None``), ``check_for_duplicates`` (default ``False``), ``klass``
    (default ``None``) and ``wrapwidth`` (default ``78``).

    Returns whatever the parser's ``parse()`` returns, with its ``wrapwidth``
    attribute set.
    """
    # get the file encoding
    enc = kwargs.get('encoding')
    if enc is None:
        # the second argument tells detect_encoding to treat input as binary
        enc = detect_encoding(f, filetype == 'mofile')

    # parse the file
    kls = _POFileParser if filetype == 'pofile' else _MOFileParser
    parser = kls(
        f,
        encoding=enc,
        check_for_duplicates=kwargs.get('check_for_duplicates', False),
        klass=kwargs.get('klass')
    )
    instance = parser.parse()
    instance.wrapwidth = kwargs.get('wrapwidth', 78)
    return instance
83 # }}}
84 # _is_file {{{
def _is_file(filename_or_contents):
    """
    Safely returns the value of os.path.isfile(filename_or_contents).

    Arguments:

    ``filename_or_contents``
        either a filename, or a string holding the contents of some file.

    Returns ``False`` instead of raising when ``os.path.isfile`` fails with
    ``TypeError``, ``ValueError`` or ``UnicodeEncodeError``.
    """
    try:
        found = os.path.isfile(filename_or_contents)
    except (TypeError, ValueError, UnicodeEncodeError):
        # the argument cannot be interpreted as a path at all
        found = False
    return found
101 # }}}
102 # function pofile() {{{
105 # pylint: disable=redefined-outer-name
def pofile(pofile, **kwargs):
    """
    Convenience function that parses the po or pot file ``pofile`` and returns
    a :class:`~polib.POFile` instance.

    Arguments:

    ``pofile``
        string, full or relative path to the po/pot file or its content (data).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.POFile`
        instance).
    """
    # all the work is shared with mofile(); only the file type differs
    return _pofile_or_mofile(pofile, filetype='pofile', **kwargs)
134 # }}}
135 # function mofile() {{{
138 # pylint: disable=redefined-outer-name
def mofile(mofile, **kwargs):
    """
    Convenience function that parses the mo file ``mofile`` and returns a
    :class:`~polib.MOFile` instance.

    Arguments:

    ``mofile``
        string, full or relative path to the mo file or its content (string
        or bytes).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext to generate the po file that was used to format the mo file
        (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.MOFile`
        instance).
    """
    # all the work is shared with pofile(); only the file type differs
    return _pofile_or_mofile(mofile, filetype='mofile', **kwargs)
169 # }}}
170 # function detect_encoding() {{{
def detect_encoding(file, binary_mode=False):
    """
    Try to detect the encoding used by the ``file``. The ``file`` argument can
    be a PO or MO file path or a string containing the contents of the file.
    If the encoding cannot be detected, the function will return the value of
    ``default_encoding``.

    Arguments:

    ``file``
        string, full or relative path to the po/mo file or its content.

    ``binary_mode``
        boolean, set this to True if ``file`` is a mo file.
    """
    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
    rxt = re.compile(u(PATTERN))
    rxb = re.compile(b(PATTERN))

    def charset_exists(charset):
        """Check whether ``charset`` is valid or not."""
        try:
            codecs.lookup(charset)
        except LookupError:
            return False
        return True

    def charset_from(match):
        """Return the charset captured by ``match`` if valid, else ``None``."""
        enc = match.group(1).strip()
        if not isinstance(enc, text_type):
            # the bytes pattern matched; normalise the name to text
            enc = enc.decode('utf-8')
        return enc if charset_exists(enc) else None

    if not _is_file(file):
        # ``file`` holds the contents: try the text pattern first and fall
        # back to the bytes pattern when the contents are not text
        try:
            match = rxt.search(file)
        except TypeError:
            match = rxb.search(file)
        if match:
            enc = charset_from(match)
            if enc is not None:
                return enc
    else:
        # For PY3, always treat as binary
        if binary_mode or PY3:
            mode, rx = 'rb', rxb
        else:
            mode, rx = 'r', rxt
        # the context manager closes the handle on every exit path,
        # including exceptions raised while reading or matching
        with open(file, mode) as f:
            for line in f:
                match = rx.search(line)
                if match:
                    enc = charset_from(match)
                    if enc is not None:
                        return enc
                    # unknown charset name: keep scanning the next lines
    return default_encoding
231 # }}}
232 # function escape() {{{
def escape(st):
    """
    Escapes the backslash, tab, carriage return, newline and double quote
    characters in the given string ``st`` and returns it.
    """
    # the backslash must be handled first so that the backslashes introduced
    # by the later replacements are not escaped a second time
    replacements = (
        ('\\', r'\\'),
        ('\t', r'\t'),
        ('\r', r'\r'),
        ('\n', r'\n'),
        ('\"', r'\"'),
    )
    for raw, escaped in replacements:
        st = st.replace(raw, escaped)
    return st
245 # }}}
246 # function unescape() {{{
def unescape(st):
    """
    Unescapes the backslash, tab, newline, carriage return and double quote
    escape sequences in the given string ``st`` and returns it.
    """
    # escape letter -> real character; anything else matched by the pattern
    # (the double quote) maps to itself
    decoded = {'n': '\n', 't': '\t', 'r': '\r', '\\': '\\'}

    def unescape_repl(match):
        char = match.group(1)
        return decoded.get(char, char)

    return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
266 # }}}
267 # function natural_sort() {{{
def natural_sort(lst):
    """
    Sort naturally the given list.
    Credits: http://stackoverflow.com/a/4836734

    Digit runs compare as integers and the other parts compare
    case-insensitively. Returns a new sorted list.
    """
    digit_runs = '([0-9]+)'

    def alphanum_key(key):
        # split into alternating text / digit chunks
        parts = []
        for chunk in re.split(digit_runs, key):
            parts.append(int(chunk) if chunk.isdigit() else chunk.lower())
        return parts

    return sorted(lst, key=alphanum_key)
283 # }}}
284 # class _BaseFile {{{
class _BaseFile(list):
    """
    Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
    classes. This class should **not** be instantiated directly.

    Instances are lists of entries that also carry the file path, encoding,
    wrap width, header and metadata.
    """

    def __init__(self, *_args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``pofile``
            string, the path to the po or mo file, or its content as a string.

        ``fpath``
            string, the file path to remember when ``pofile`` is not an
            existing file (optional).

        ``wrapwidth``
            integer, the wrap width, only useful when the ``-w`` option was
            passed to xgettext (optional, default: ``78``).

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file, (optional, default: ``False``).
        """
        list.__init__(self)
        # remember the path only when ``pofile`` names an existing file,
        # otherwise fall back to the explicit ``fpath`` keyword (may be None)
        pofile = kwargs.get('pofile', None)  # pylint: disable=redefined-outer-name
        if pofile and _is_file(pofile):
            self.fpath = pofile
        else:
            self.fpath = kwargs.get('fpath')
        # the width at which lines should be wrapped
        self.wrapwidth = kwargs.get('wrapwidth', 78)
        # the file encoding
        self.encoding = kwargs.get('encoding', default_encoding)
        # whether to check for duplicate entries or not
        self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
        # header
        self.header = ''
        # both po and mo files have metadata
        self.metadata = {}
        self.metadata_is_fuzzy = 0

    def __unicode__(self):
        """
        Returns the unicode representation of the file: the metadata entry,
        then the non-obsolete entries, then the obsolete entries.
        """
        entries = [self.metadata_as_entry()]
        entries.extend(e for e in self if not e.obsolete)
        entries.extend(self.obsolete_entries())  # pylint: disable=no-member
        return u('\n').join(
            [entry.__unicode__(self.wrapwidth) for entry in entries]
        )

    if PY3:
        def __str__(self):
            """
            Returns the string representation of the file.
            """
            return self.__unicode__()
    else:
        def __str__(self):
            """
            Returns the string representation of the file, encoded with the
            file encoding.
            """
            return compat.ustr(self).encode(self.encoding)

    def __contains__(self, entry):
        """
        Overridden ``list`` method to implement the membership test (in and
        not in).
        The method considers that an entry is in the file if it finds an entry
        that has the same msgid (the test is **case sensitive**) and the same
        msgctxt (or none for both entries).

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \
            is not None

    def __eq__(self, other):
        """Two files are equal when their string representations match."""
        return str(self) == str(other)

    def __hash__(self):
        """Hash of the string representation, consistent with ``__eq__``."""
        return hash(str(self))

    def append(self, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        # check_for_duplicates may not be defined (yet) when unpickling.
        # But if pickling, we never want to check for duplicates anyway.
        if getattr(self, 'check_for_duplicates', False) and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super(_BaseFile, self).append(entry)

    def insert(self, index, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Arguments:

        ``index``
            index at which the entry should be inserted.

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        # same guard as append(): the attribute may not exist yet when the
        # instance is being rebuilt (e.g. unpickling)
        if getattr(self, 'check_for_duplicates', False) and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super(_BaseFile, self).insert(index, entry)

    def metadata_as_entry(self):
        """
        Returns the file metadata as a :class:`~polib.POEntry` instance with
        an empty msgid; the msgstr holds one ``name: value`` line per
        metadata item, in the order given by :meth:`ordered_metadata`.
        """
        e = POEntry(msgid='')
        mdata = self.ordered_metadata()
        if mdata:
            strs = ['%s: %s' % (name, value) for name, value in mdata]
            e.msgstr = '\n'.join(strs) + '\n'
        if self.metadata_is_fuzzy:
            e.flags.append('fuzzy')
        return e

    def save(self, fpath=None, repr_method='__unicode__', newline=None):
        """
        Saves the po file to ``fpath``.
        If it is an existing file and no ``fpath`` is provided, then the
        existing file is rewritten with the modified data.

        Keyword arguments:

        ``fpath``
            string, full or relative path to the file.

        ``repr_method``
            string, the method to use for output.

        ``newline``
            string, controls how universal newlines works

        Raises ``IOError`` when neither ``fpath`` nor ``self.fpath`` is set.
        """
        if self.fpath is None and fpath is None:
            raise IOError('You must provide a file path to save() method')
        contents = getattr(self, repr_method)()
        if fpath is None:
            fpath = self.fpath
        if repr_method == 'to_binary':
            fhandle = open(fpath, 'wb')
        else:
            # decode before opening so that a decoding failure does not
            # leave a truncated file behind
            if not isinstance(contents, text_type):
                contents = contents.decode(self.encoding)
            fhandle = io.open(
                fpath,
                'w',
                encoding=self.encoding,
                newline=newline
            )
        # the context manager closes the handle even when write() fails
        with fhandle:
            fhandle.write(contents)
        # set the file path if not set
        if self.fpath is None and fpath:
            self.fpath = fpath

    def find(self, st, by='msgid', include_obsolete_entries=False,
             msgctxt=False):
        """
        Find the entry which msgid (or property identified by the ``by``
        argument) matches the string ``st``.

        Keyword arguments:

        ``st``
            string, the string to search for.

        ``by``
            string, the property to use for comparison (default: ``msgid``).

        ``include_obsolete_entries``
            boolean, whether to also search in entries that are obsolete.

        ``msgctxt``
            string, allows specifying a specific message context for the
            search (default: ``False``, the context is not compared).

        Returns the matching entry or ``None``.
        """
        if include_obsolete_entries:
            entries = self[:]
        else:
            entries = [e for e in self if not e.obsolete]
        matches = []
        for e in entries:
            if getattr(e, by) == st:
                if msgctxt is not False and e.msgctxt != msgctxt:
                    continue
                matches.append(e)
        if len(matches) == 1:
            return matches[0]
        elif len(matches) > 1:
            if not msgctxt:
                # find the entry with no msgctxt (the last one wins)
                e = None
                for m in matches:
                    if not m.msgctxt:
                        e = m
                if e:
                    return e
            # fallback to the first entry found
            return matches[0]
        return None

    def ordered_metadata(self):
        """
        Convenience method that returns an ordered version of the metadata
        dictionary. The return value is list of tuples (metadata name,
        metadata_value).
        """
        # copy the dict first
        metadata = self.metadata.copy()
        data_order = [
            'Project-Id-Version',
            'Report-Msgid-Bugs-To',
            'POT-Creation-Date',
            'PO-Revision-Date',
            'Last-Translator',
            'Language-Team',
            'Language',
            'MIME-Version',
            'Content-Type',
            'Content-Transfer-Encoding',
            'Plural-Forms'
        ]
        ordered_data = []
        for data in data_order:
            try:
                value = metadata.pop(data)
                ordered_data.append((data, value))
            except KeyError:
                pass
        # the rest of the metadata will be alphabetically ordered since there
        # are no specs for this AFAIK
        for data in natural_sort(metadata.keys()):
            ordered_data.append((data, metadata[data]))
        return ordered_data

    def to_binary(self):
        """
        Return the binary representation of the file.
        """
        offsets = []
        entries = self.translated_entries()  # pylint: disable=no-member

        # the keys are sorted in the .mo file
        entries.sort(key=lambda o: o.msgid_with_context.encode('utf-8'))
        # add metadata entry
        mentry = self.metadata_as_entry()
        entries = [mentry] + entries
        entries_len = len(entries)
        nul = b('\0')
        # chunks are collected and joined once; the running lengths stand in
        # for len(ids) / len(strs) when computing offsets
        id_chunks, str_chunks = [], []
        ids_len = strs_len = 0
        for e in entries:
            # For each string, we need size and file offset.  Each string is
            # NUL terminated; the NUL does not count into the size.
            msgid = b('')
            if e.msgctxt:
                # Contexts are stored by storing the concatenation of the
                # context, a <EOT> byte, and the original string
                msgid = self._encode(e.msgctxt + '\4')
            if e.msgid_plural:
                msgstr = []
                for index in sorted(e.msgstr_plural.keys()):
                    msgstr.append(e.msgstr_plural[index])
                msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
                msgstr = self._encode('\0'.join(msgstr))
            else:
                msgid += self._encode(e.msgid)
                msgstr = self._encode(e.msgstr)
            offsets.append((ids_len, len(msgid), strs_len, len(msgstr)))
            id_chunks.append(msgid + nul)
            str_chunks.append(msgstr + nul)
            ids_len += len(msgid) + 1
            strs_len += len(msgstr) + 1
        ids = b('').join(id_chunks)
        strs = b('').join(str_chunks)

        # The header is 7 32-bit unsigned integers.
        keystart = 7 * 4 + 16 * entries_len
        # and the values start after the keys
        valuestart = keystart + ids_len
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets += [l1, o1 + keystart]
            voffsets += [l2, o2 + valuestart]
        offsets = koffsets + voffsets

        output = struct.pack(
            "Iiiiiii",
            # Magic number
            MOFile.MAGIC,
            # Version
            0,
            # number of entries
            entries_len,
            # start of key index
            7 * 4,
            # start of value index
            7 * 4 + entries_len * 8,
            # size and offset of hash table, we don't use hash tables
            0, keystart
        )
        packed = array.array("i", offsets)
        if hasattr(packed, 'tobytes'):  # python 3.2 or superior
            output += packed.tobytes()
        else:
            output += packed.tostring()  # pylint: disable=no-member
        output += ids
        output += strs
        return output

    def _encode(self, mixed):
        """
        Encodes the given ``mixed`` argument with the file encoding if and
        only if it's an unicode string and returns the encoded string.
        """
        if isinstance(mixed, text_type):
            mixed = mixed.encode(self.encoding)
        return mixed
638 # }}}
639 # class POFile {{{
class POFile(_BaseFile):
    """
    Po (or Pot) file reader/writer.
    This class inherits the :class:`~polib._BaseFile` class and, by extension,
    the python ``list`` type.
    """

    def __unicode__(self):
        """
        Returns the unicode representation of the po file: the header
        rendered as ``#`` comment lines, followed by the entries.
        """
        rendered = []
        for line in self.header.split('\n'):
            if not line:
                rendered.append("#\n")
            elif line[:1] in [',', ':']:
                # flag / reference style lines keep their marker glued to '#'
                rendered.append('#%s\n' % line)
            else:
                rendered.append('# %s\n' % line)
        ret = ''.join(rendered)

        if not isinstance(ret, text_type):
            ret = ret.decode(self.encoding)

        return ret + _BaseFile.__unicode__(self)

    def save_as_mofile(self, fpath):
        """
        Saves the binary representation of the file to given ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the mo file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    def percent_translated(self):
        """
        Convenience method that returns the percentage of translated
        messages, as an integer. Obsolete entries are not counted; a file
        without countable entries is reported as ``100``.
        """
        total = sum(1 for e in self if not e.obsolete)
        if total == 0:
            return 100
        translated = len(self.translated_entries())
        return int(translated * 100 / float(total))

    def translated_entries(self):
        """
        Convenience method that returns the list of translated entries.
        """
        return [e for e in self if e.translated()]

    def untranslated_entries(self):
        """
        Convenience method that returns the list of untranslated entries
        (obsolete and fuzzy entries are left out).
        """
        return [
            e for e in self
            if not (e.translated() or e.obsolete or e.fuzzy)
        ]

    def fuzzy_entries(self):
        """
        Convenience method that returns the list of fuzzy entries that are
        not obsolete.
        """
        return [e for e in self if e.fuzzy and not e.obsolete]

    def obsolete_entries(self):
        """
        Convenience method that returns the list of obsolete entries.
        """
        return [e for e in self if e.obsolete]

    def merge(self, refpot):
        """
        Convenience method that merges the current pofile with the pot file
        provided.

        Every entry of ``refpot`` is merged (see :meth:`POEntry.merge`) into
        the entry of this file that has the same msgid and context; when no
        such entry exists a new one is appended first. Entries of this file
        that are absent from ``refpot`` are marked obsolete.

        Keyword argument:

        ``refpot``
            object POFile, the reference catalog.
        """
        # Index both catalogs by msgid+context for faster access
        own_by_key = dict(
            (entry.msgid_with_context, entry) for entry in self
        )
        ref_keys = set(entry.msgid_with_context for entry in refpot)
        # Merge entries that are in the refpot
        for ref_entry in refpot:
            target = own_by_key.get(ref_entry.msgid_with_context)
            if target is None:
                target = POEntry()
                self.append(target)
            target.merge(ref_entry)
        # ok, now we must "obsolete" entries that are not in the refpot anymore
        for entry in self:
            if entry.msgid_with_context not in ref_keys:
                entry.obsolete = True
746 # }}}
747 # class MOFile {{{
class MOFile(_BaseFile):
    """
    Mo file reader/writer.
    This class inherits the :class:`~polib._BaseFile` class and, by
    extension, the python ``list`` type.
    """
    # magic number written in the mo header, and its byte-swapped form
    MAGIC = 0x950412de
    MAGIC_SWAPPED = 0xde120495

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts all keywords arguments accepted by
        :class:`~polib._BaseFile` class.
        """
        _BaseFile.__init__(self, *args, **kwargs)
        self.magic_number = None
        self.version = 0

    def save_as_pofile(self, fpath):
        """
        Saves the mofile as a pofile to ``fpath`` (text representation).

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath)

    # pylint: disable=no-self-use,arguments-differ
    def save(self, fpath=None):
        """
        Saves the mofile to ``fpath`` using the binary representation.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    # pylint: disable=no-self-use
    def percent_translated(self):
        """
        Always ``100``; kept for interface parity with POFile instances.
        """
        return 100

    # pylint: disable=no-self-use
    def translated_entries(self):
        """
        Returns the file itself (every entry counts as translated); kept for
        interface parity with POFile instances.
        """
        return self

    # pylint: disable=no-self-use
    def untranslated_entries(self):
        """
        Always an empty list; kept for interface parity with POFile instances.
        """
        return []

    # pylint: disable=no-self-use
    def fuzzy_entries(self):
        """
        Always an empty list; kept for interface parity with POFile instances.
        """
        return []

    # pylint: disable=no-self-use
    def obsolete_entries(self):
        """
        Always an empty list; kept for interface parity with POFile instances.
        """
        return []
825 # }}}
826 # class _BaseEntry {{{
class _BaseEntry(object):
    """
    Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
    This class should **not** be instantiated directly.
    """

    def __init__(self, *_args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``msgid``
            string, the entry msgid.

        ``msgstr``
            string, the entry msgstr.

        ``msgid_plural``
            string, the entry msgid_plural.

        ``msgstr_plural``
            dict, the entry msgstr_plural lines.

        ``msgctxt``
            string, the entry context (msgctxt).

        ``obsolete``
            bool, whether the entry is "obsolete" or not.

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).
        """
        self.msgid = kwargs.get('msgid', '')
        self.msgstr = kwargs.get('msgstr', '')
        self.msgid_plural = kwargs.get('msgid_plural', '')
        self.msgstr_plural = kwargs.get('msgstr_plural', {})
        self.msgctxt = kwargs.get('msgctxt', None)
        self.obsolete = kwargs.get('obsolete', False)
        self.encoding = kwargs.get('encoding', default_encoding)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry.
        """
        # obsolete entries have every line prefixed with the "#~ " marker
        delflag = '#~ ' if self.obsolete else ''
        out = []
        # write the msgctxt if any
        if self.msgctxt is not None:
            out += self._str_field("msgctxt", delflag, "", self.msgctxt,
                                   wrapwidth)
        # write the msgid
        out += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
        # write the msgid_plural if any
        if self.msgid_plural:
            out += self._str_field("msgid_plural", delflag, "",
                                   self.msgid_plural, wrapwidth)
        if self.msgstr_plural:
            # write the msgstr_plural if any, in index order
            plurals = self.msgstr_plural
            for index in sorted(plurals):
                out += self._str_field("msgstr", delflag, '[%s]' % index,
                                       plurals[index], wrapwidth)
        else:
            # otherwise write the msgstr
            out += self._str_field("msgstr", delflag, "", self.msgstr,
                                   wrapwidth)
        # trailing empty item so the joined text ends with a newline
        out.append('')
        return u('\n').join(out)

    if PY3:
        def __str__(self):
            """
            Returns the string representation of the entry.
            """
            return self.__unicode__()
    else:
        def __str__(self):
            """
            Returns the string representation of the entry, encoded with the
            entry encoding.
            """
            return compat.ustr(self).encode(self.encoding)

    def __eq__(self, other):
        """Two entries are equal when their string representations match."""
        return str(self) == str(other)

    def __hash__(self):
        """Hash of the string representation, consistent with ``__eq__``."""
        return hash(str(self))

    # pylint: disable=no-self-use
    def _str_field(self, fieldname, delflag, plural_index, field,
                   wrapwidth=78):
        """
        Returns the list of output lines for one field (``msgid``,
        ``msgstr``...), each prefixed with ``delflag``. A field that spans
        several lines, or that is too long for ``wrapwidth``, starts with an
        empty ``""`` line followed by one quoted line per chunk.
        """
        chunks = field.splitlines(True)
        if len(chunks) > 1:
            chunks = [''] + chunks  # start with initial empty line
        else:
            escaped_field = escape(field)
            specialchars_count = sum(
                field.count(c) for c in ['\\', '\n', '\r', '\t', '"']
            )
            # comparison must take into account fieldname length + one space
            # + 2 quotes (eg. msgid "<string>")
            flength = len(fieldname) + 3
            if plural_index:
                flength += len(plural_index)
            real_wrapwidth = wrapwidth - flength + specialchars_count
            if wrapwidth > 0 and len(field) > real_wrapwidth:
                # Wrap the line but take field name into account
                wrapped = textwrap.wrap(
                    escaped_field,
                    wrapwidth - 2,  # 2 for quotes ""
                    drop_whitespace=False,
                    break_long_words=False
                )
                chunks = [''] + [unescape(item) for item in wrapped]
            else:
                chunks = [field]
        if fieldname.startswith('previous_'):
            # quick and dirty trick to get the real field name
            fieldname = fieldname[9:]

        first, rest = chunks[0], chunks[1:]
        ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index,
                                escape(first))]
        ret.extend('%s"%s"' % (delflag, escape(line)) for line in rest)
        return ret

    @property
    def msgid_with_context(self):
        """
        The msgid, prefixed with the msgctxt and an EOT (``\\x04``) character
        when the entry has a context.
        """
        if not self.msgctxt:
            return self.msgid
        return '%s%s%s' % (self.msgctxt, "\x04", self.msgid)
964 # }}}
965 # class POEntry {{{
968 class POEntry(_BaseEntry):
970 Represents a po file entry.
973 def __init__(self, *args, **kwargs):
975 Constructor, accepts the following keyword arguments:
977 ``comment``
978 string, the entry comment.
980 ``tcomment``
981 string, the entry translator comment.
983 ``occurrences``
984 list, the entry occurrences.
986 ``flags``
987 list, the entry flags.
989 ``previous_msgctxt``
990 string, the entry previous context.
992 ``previous_msgid``
993 string, the entry previous msgid.
995 ``previous_msgid_plural``
996 string, the entry previous msgid_plural.
998 ``linenum``
999 integer, the line number of the entry
1001 _BaseEntry.__init__(self, *args, **kwargs)
1002 self.comment = kwargs.get('comment', '')
1003 self.tcomment = kwargs.get('tcomment', '')
1004 self.occurrences = kwargs.get('occurrences', [])
1005 self.flags = kwargs.get('flags', [])
1006 self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
1007 self.previous_msgid = kwargs.get('previous_msgid', None)
1008 self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
1009 self.linenum = kwargs.get('linenum', None)
1011 def __unicode__(self, wrapwidth=78):
1013 Returns the unicode representation of the entry.
1015 ret = []
1016 # comments first, if any (with text wrapping as xgettext does)
1017 if self.obsolete:
1018 comments = [('tcomment', '# ')]
1019 else:
1020 comments = [('comment', '#. '), ('tcomment', '# ')]
1021 for c in comments:
1022 val = getattr(self, c[0])
1023 if val:
1024 for comment in val.split('\n'):
1025 if len(comment) + len(c[1]) > wrapwidth > 0:
1026 ret += textwrap.wrap(
1027 comment,
1028 wrapwidth,
1029 initial_indent=c[1],
1030 subsequent_indent=c[1],
1031 break_long_words=False
1033 else:
1034 ret.append('%s%s' % (c[1], comment))
1036 # occurrences (with text wrapping as xgettext does)
1037 if not self.obsolete and self.occurrences:
1038 filelist = []
1039 for fpath, lineno in self.occurrences:
1040 if lineno:
1041 filelist.append('%s:%s' % (fpath, lineno))
1042 else:
1043 filelist.append(fpath)
1044 filestr = ' '.join(filelist)
1045 if len(filestr) + 3 > wrapwidth > 0:
1046 # textwrap split words that contain hyphen, this is not
1047 # what we want for filenames, so the dirty hack is to
1048 # temporally replace hyphens with a char that a file cannot
1049 # contain, like "*"
1050 ret += [line.replace('*', '-') for line in textwrap.wrap(
1051 filestr.replace('-', '*'),
1052 wrapwidth,
1053 initial_indent='#: ',
1054 subsequent_indent='#: ',
1055 break_long_words=False
1057 else:
1058 ret.append('#: ' + filestr)
1060 # flags (TODO: wrapping ?)
1061 if self.flags:
1062 ret.append('#, %s' % ', '.join(self.flags))
1064 # previous context and previous msgid/msgid_plural
1065 fields = ['previous_msgctxt', 'previous_msgid',
1066 'previous_msgid_plural']
1067 if self.obsolete:
1068 prefix = "#~| "
1069 else:
1070 prefix = "#| "
1071 for f in fields:
1072 val = getattr(self, f)
1073 if val is not None:
1074 ret += self._str_field(f, prefix, "", val, wrapwidth)
1076 ret.append(_BaseEntry.__unicode__(self, wrapwidth))
1077 ret = u('\n').join(ret)
1078 return ret
1080 # pylint: disable=cmp-method,too-many-return-statements
1081 def __cmp__(self, other):
1083 Called by comparison operations if rich comparison is not defined.
1085 # First: Obsolete test
1086 if self.obsolete != other.obsolete:
1087 if self.obsolete:
1088 return -1
1089 else:
1090 return 1
1091 # Work on a copy to protect original
1092 occ1 = sorted(self.occurrences[:])
1093 occ2 = sorted(other.occurrences[:])
1094 if occ1 > occ2:
1095 return 1
1096 if occ1 < occ2:
1097 return -1
1098 # Compare context
1099 msgctxt = self.msgctxt or '0'
1100 othermsgctxt = other.msgctxt or '0'
1101 if msgctxt > othermsgctxt:
1102 return 1
1103 elif msgctxt < othermsgctxt:
1104 return -1
1105 # Compare msgid_plural
1106 msgid_plural = self.msgid_plural or '0'
1107 othermsgid_plural = other.msgid_plural or '0'
1108 if msgid_plural > othermsgid_plural:
1109 return 1
1110 elif msgid_plural < othermsgid_plural:
1111 return -1
1112 # Compare msgstr_plural
1113 if self.msgstr_plural and isinstance(self.msgstr_plural, dict):
1114 msgstr_plural = list(self.msgstr_plural.values())
1115 else:
1116 msgstr_plural = []
1117 if other.msgstr_plural and isinstance(other.msgstr_plural, dict):
1118 othermsgstr_plural = list(other.msgstr_plural.values())
1119 else:
1120 othermsgstr_plural = []
1121 if msgstr_plural > othermsgstr_plural:
1122 return 1
1123 elif msgstr_plural < othermsgstr_plural:
1124 return -1
1125 # Compare msgid
1126 if self.msgid > other.msgid:
1127 return 1
1128 elif self.msgid < other.msgid:
1129 return -1
1130 # Compare msgstr
1131 if self.msgstr > other.msgstr:
1132 return 1
1133 elif self.msgstr < other.msgstr:
1134 return -1
1135 return 0
1137 def __gt__(self, other):
1138 return self.__cmp__(other) > 0
1140 def __lt__(self, other):
1141 return self.__cmp__(other) < 0
1143 def __ge__(self, other):
1144 return self.__cmp__(other) >= 0
1146 def __le__(self, other):
1147 return self.__cmp__(other) <= 0
1149 def __eq__(self, other):
1150 return self.__cmp__(other) == 0
1152 def __ne__(self, other):
1153 return self.__cmp__(other) != 0
1155 def translated(self):
1157 Returns ``True`` if the entry has been translated or ``False``
1158 otherwise.
1160 if self.obsolete or self.fuzzy:
1161 return False
1162 if self.msgstr != '':
1163 return True
1164 if self.msgstr_plural:
1165 for pos in self.msgstr_plural:
1166 if self.msgstr_plural[pos] == '':
1167 return False
1168 return True
1169 return False
def merge(self, other):
    """
    Merge the current entry with the given pot entry.

    The identifying and descriptive attributes (``msgid``, ``msgctxt``,
    ``occurrences``, ``comment``, ``msgid_plural``, ``obsolete`` and the
    ``previous_*`` fields) are taken from ``other``. The flags are replaced
    by a copy of ``other.flags``; if this entry was fuzzy before the merge
    it stays fuzzy, without listing the ``'fuzzy'`` flag twice. Existing
    plural translations are kept and an empty string is added for every
    plural position that only ``other`` knows about.

    Note: ``occurrences`` is assigned, not copied, so both entries refer to
    the same list object afterwards.
    """
    self.msgid = other.msgid
    self.msgctxt = other.msgctxt
    self.occurrences = other.occurrences
    self.comment = other.comment
    # remember the fuzzy state before the flags are overwritten
    was_fuzzy = self.fuzzy
    self.flags = other.flags[:]  # clone flags
    if was_fuzzy and 'fuzzy' not in self.flags:
        self.flags.append('fuzzy')
    self.msgid_plural = other.msgid_plural
    self.obsolete = other.obsolete
    self.previous_msgctxt = other.previous_msgctxt
    self.previous_msgid = other.previous_msgid
    self.previous_msgid_plural = other.previous_msgid_plural
    if other.msgstr_plural:
        for pos in other.msgstr_plural:
            # keep existing translation at pos if any
            self.msgstr_plural.setdefault(pos, '')
@property
def fuzzy(self):
    """``True`` when ``'fuzzy'`` is one of the entry's flags."""
    current_flags = self.flags
    return 'fuzzy' in current_flags
def __hash__(self):
    """Hash the entry on its ``(msgid, msgstr)`` pair."""
    identity = (self.msgid, self.msgstr)
    return hash(identity)
1202 # }}}
1203 # class MOEntry {{{
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments,
        for consistency with :class:`~polib.POEntry`:

            ``comment``
            ``tcomment``
            ``occurrences``
            ``flags``
            ``previous_msgctxt``
            ``previous_msgid``
            ``previous_msgid_plural``

        Note: even though these keyword arguments are accepted,
        they hold no real meaning in the context of MO files
        and are simply ignored.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # Whatever was passed for the PO-only attributes is discarded:
        # they are reset to their empty values right after the base
        # constructor ran.
        self.comment = ''
        self.tcomment = ''
        self.occurrences = []
        self.flags = []
        for attribute in ('previous_msgctxt',
                          'previous_msgid',
                          'previous_msgid_plural'):
            setattr(self, attribute, None)

    def __hash__(self):
        """Hash the entry on its ``(msgid, msgstr)`` pair."""
        identity = (self.msgid, self.msgstr)
        return hash(identity)
1239 # }}}
1240 # class _POFileParser {{{
class _POFileParser(object):
    """
    A finite state machine to parse efficiently and correctly po
    file format.

    :meth:`parse` classifies every line of the input into a two-letter
    symbol and passes it to :meth:`process`, which looks up the transition
    registered for the (symbol, current state) pair and runs the matching
    ``handle_*`` method to update the entry being built.
    """

    # States in which the current entry is complete: a handler that opens a
    # new entry stores the current one first when the machine is in one of
    # these states.
    _ENTRY_END_STATES = ('mc', 'ms', 'mx')
    # A double quote that is not preceded by a backslash.
    _UNESCAPED_QUOTE = re.compile(r'([^\\]|^)"')
    # The numeric index of a ``msgstr[N]`` line (one or more digits).
    _PLURAL_INDEX = re.compile(r'msgstr\[(\d+)')

    # pylint: disable=redefined-outer-name
    def __init__(self, pofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``pofile``
            string, path to the po file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class to instantiate for the parsed file instead of
            :class:`POFile` (optional).
        """
        enc = kwargs.get('encoding', default_encoding)
        if _is_file(pofile):
            try:
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
            except LookupError:
                # unknown encoding name: fall back to the default one
                enc = default_encoding
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
        else:
            # not a file: treat the argument as the po content itself
            self.fhandle = pofile.splitlines()

        klass = kwargs.get('klass')
        if klass is None:
            klass = POFile
        self.instance = klass(
            pofile=pofile,
            encoding=enc,
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )
        self.transitions = {}
        self.current_line = 0
        self.current_entry = POEntry(linenum=self.current_line)
        self.current_state = 'st'
        self.current_token = None
        # two memo flags used in handlers
        self.msgstr_index = 0
        self.entry_obsolete = 0
        # Configure the state machine, by adding transitions.
        # Signification of symbols:
        #     * ST: Beginning of the file (start)
        #     * HE: Header
        #     * TC: a translation comment
        #     * GC: a generated comment
        #     * OC: a file/line occurrence
        #     * FL: a flags line
        #     * CT: a message context
        #     * PC: a previous msgctxt
        #     * PM: a previous msgid
        #     * PP: a previous msgid_plural
        #     * MI: a msgid
        #     * MP: a msgid plural
        #     * MS: a msgstr
        #     * MX: a msgstr plural
        #     * MC: a msgid or msgstr continuation line
        all_states = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp',
                      'tc', 'ms', 'mp', 'mx', 'mi']

        self.add('tc', ['st', 'he'], 'he')
        self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms',
                        'mp', 'mx', 'mi'], 'tc')
        self.add('gc', all_states, 'gc')
        self.add('oc', all_states, 'oc')
        self.add('fl', all_states, 'fl')
        self.add('pc', all_states, 'pc')
        self.add('pm', all_states, 'pm')
        self.add('pp', all_states, 'pp')
        self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm',
                        'pp', 'ms', 'mx'], 'ct')
        self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc',
                        'pm', 'pp', 'ms', 'mx'], 'mi')
        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
        self.add('ms', ['mi', 'mp', 'tc'], 'ms')
        self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')

    def parse(self):
        """
        Run the state machine, parse the file line by line and call process()
        with the current matched symbol.

        Returns the populated file instance. Raises ``IOError`` on a syntax
        error. The underlying file (if any) is closed in every case,
        including when an error is raised.
        """
        try:
            self._parse_lines()
            self._extract_metadata()
        finally:
            # close opened file; a list of lines has nothing to close
            if hasattr(self.fhandle, 'close'):
                self.fhandle.close()
        return self.instance

    # pylint: disable=too-many-branches
    def _parse_lines(self):
        """Classify every input line and feed it to the state machine."""
        keywords = {
            'msgctxt': 'ct',
            'msgid': 'mi',
            'msgstr': 'ms',
            'msgid_plural': 'mp',
        }
        prev_keywords = {
            'msgid_plural': 'pp',
            'msgid': 'pm',
            'msgctxt': 'pc',
        }
        bom = codecs.BOM_UTF8.decode('utf-8')
        tokens = []
        for line in self.fhandle:
            self.current_line += 1
            if self.current_line == 1 and line.startswith(bom):
                line = line[len(bom):]
            line = line.strip()
            if line == '':
                continue

            tokens = line.split(None, 2)
            nb_tokens = len(tokens)

            if tokens[0] == '#~|':
                continue

            if tokens[0] == '#~' and nb_tokens > 1:
                # obsolete entry: drop the marker and parse the rest normally
                line = line[3:].strip()
                tokens = tokens[1:]
                nb_tokens -= 1
                self.entry_obsolete = 1
            else:
                self.entry_obsolete = 0

            # Take care of keywords like
            # msgid, msgid_plural, msgctxt & msgstr.
            if tokens[0] in keywords and nb_tokens > 1:
                line = line[len(tokens[0]):].lstrip()
                if self._UNESCAPED_QUOTE.search(line[1:-1]):
                    raise self._syntax_error('unescaped double quote found')
                self.current_token = line
                self.process(keywords[tokens[0]])
                continue

            self.current_token = line

            if tokens[0] == '#:':
                if nb_tokens <= 1:
                    continue
                # we are on a occurrences line
                self.process('oc')

            elif line[:1] == '"':
                # we are on a continuation line
                if self._UNESCAPED_QUOTE.search(line[1:-1]):
                    raise self._syntax_error('unescaped double quote found')
                self.process('mc')

            elif line[:7] == 'msgstr[':
                # we are on a msgstr plural
                self.process('mx')

            elif tokens[0] == '#,':
                if nb_tokens <= 1:
                    continue
                # we are on a flags line
                self.process('fl')

            elif tokens[0] == '#' or tokens[0].startswith('##'):
                # we are on a translator comment line
                self.process('tc')

            elif tokens[0] == '#.':
                if nb_tokens <= 1:
                    continue
                # we are on a generated comment line
                self.process('gc')

            elif tokens[0] == '#|':
                if nb_tokens <= 1:
                    raise self._syntax_error()

                # Remove the marker and any whitespace right after that.
                line = line[2:].lstrip()
                self.current_token = line

                if tokens[1].startswith('"'):
                    # Continuation of previous metadata.
                    self.process('mc')
                    continue

                if nb_tokens == 2:
                    # Invalid continuation line.
                    raise self._syntax_error('invalid continuation line')

                # we are on a "previous translation" comment line,
                if tokens[1] not in prev_keywords:
                    # Unknown keyword in previous translation comment.
                    raise self._syntax_error(
                        'unknown keyword %s' % tokens[1])

                # Remove the keyword and any whitespace
                # between it and the starting quote.
                line = line[len(tokens[1]):].lstrip()
                self.current_token = line
                self.process(prev_keywords[tokens[1]])

            else:
                raise self._syntax_error()

        if self.current_entry and len(tokens) > 0 and \
           not tokens[0].startswith('#'):
            # since entries are added when another entry is found, we must add
            # the last entry here (only if there are lines). Trailing comments
            # are ignored
            self.instance.append(self.current_entry)

    def _extract_metadata(self):
        """Move the entry with an empty msgid, if any, into the metadata."""
        metadataentry = self.instance.find('')
        if not metadataentry:
            return
        # remove the entry
        self.instance.remove(metadataentry)
        self.instance.metadata_is_fuzzy = metadataentry.flags
        key = None
        for msg in metadataentry.msgstr.splitlines():
            try:
                key, val = msg.split(':', 1)
                self.instance.metadata[key] = val.strip()
            except (ValueError, KeyError):
                # no "key: value" pair on this line: treat it as the
                # continuation of the previous key
                if key is not None:
                    self.instance.metadata[key] += '\n' + msg.strip()

    def _syntax_error(self, detail=''):
        """Build the ``IOError`` reporting a syntax error at the current line."""
        fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
        message = 'Syntax error in po file %s(line %s)' % (
            fpath, self.current_line)
        if detail:
            message += ': ' + detail
        return IOError(message)

    def add(self, symbol, states, next_state):
        """
        Add a transition to the state machine.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``states``
            list, a list of states (two chars symbols).

        ``next_state``
            the next state the fsm will have after the action.
        """
        action = getattr(self, 'handle_%s' % next_state)
        for state in states:
            self.transitions[(symbol, state)] = (action, next_state)

    def process(self, symbol):
        """
        Process the transition corresponding to the current state and the
        symbol provided.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        Any failure (unknown transition or error inside the handler) closes
        the file and is reported as an ``IOError`` syntax error carrying the
        current line number.
        """
        try:
            (action, state) = self.transitions[(symbol, self.current_state)]
            if action():
                self.current_state = state
        except Exception:  # pylint: disable=broad-except
            if hasattr(self.fhandle, 'close'):
                self.fhandle.close()
            raise self._syntax_error()

    # state handlers

    def _begin_entry_if_needed(self):
        """Store the finished entry and start a new one when appropriate."""
        if self.current_state in self._ENTRY_END_STATES:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)

    @classmethod
    def _plural_index(cls, token):
        """Return the integer N of a ``msgstr[N]`` token."""
        match = cls._PLURAL_INDEX.match(token)
        if match is None:
            raise ValueError('invalid msgstr plural index')
        return int(match.group(1))

    def handle_he(self):
        """Handle a header comment."""
        if self.instance.header != '':
            self.instance.header += '\n'
        # drop the first two characters (comment marker) of the line
        self.instance.header += self.current_token[2:]
        return True

    def handle_tc(self):
        """Handle a translator comment."""
        self._begin_entry_if_needed()
        if self.current_entry.tcomment != '':
            self.current_entry.tcomment += '\n'
        tcomment = self.current_token.lstrip('#')
        if tcomment.startswith(' '):
            tcomment = tcomment[1:]
        self.current_entry.tcomment += tcomment
        return True

    def handle_gc(self):
        """Handle a generated comment."""
        self._begin_entry_if_needed()
        if self.current_entry.comment != '':
            self.current_entry.comment += '\n'
        self.current_entry.comment += self.current_token[3:]
        return True

    def handle_oc(self):
        """Handle a file:num occurrence."""
        self._begin_entry_if_needed()
        occurrences = self.current_token[3:].split()
        for occurrence in occurrences:
            if occurrence != '':
                try:
                    fil, line = occurrence.rsplit(':', 1)
                    if not line.isdigit():
                        # the colon was part of the file name
                        fil = occurrence
                        line = ''
                    self.current_entry.occurrences.append((fil, line))
                except (ValueError, AttributeError):
                    # no colon at all: file name without line number
                    self.current_entry.occurrences.append((occurrence, ''))
        return True

    def handle_fl(self):
        """Handle a flags line."""
        self._begin_entry_if_needed()
        self.current_entry.flags += [c.strip() for c in
                                     self.current_token[3:].split(',')]
        return True

    def handle_pp(self):
        """Handle a previous msgid_plural line."""
        self._begin_entry_if_needed()
        self.current_entry.previous_msgid_plural = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pm(self):
        """Handle a previous msgid line."""
        self._begin_entry_if_needed()
        self.current_entry.previous_msgid = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pc(self):
        """Handle a previous msgctxt line."""
        self._begin_entry_if_needed()
        self.current_entry.previous_msgctxt = \
            unescape(self.current_token[1:-1])
        return True

    def handle_ct(self):
        """Handle a msgctxt."""
        self._begin_entry_if_needed()
        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_mi(self):
        """Handle a msgid."""
        self._begin_entry_if_needed()
        self.current_entry.obsolete = self.entry_obsolete
        self.current_entry.msgid = unescape(self.current_token[1:-1])
        return True

    def handle_mp(self):
        """Handle a msgid plural."""
        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_ms(self):
        """Handle a msgstr."""
        self.current_entry.msgstr = unescape(self.current_token[1:-1])
        return True

    def handle_mx(self):
        """Handle a msgstr plural."""
        token = self.current_token
        # the index may have more than one digit (e.g. ``msgstr[10]``)
        index = self._plural_index(token)
        value = token[token.find('"') + 1:-1]
        self.current_entry.msgstr_plural[index] = unescape(value)
        # remembered so that continuation lines extend the right plural
        self.msgstr_index = index
        return True

    def handle_mc(self):
        """Handle a msgid or msgstr continuation line."""
        token = unescape(self.current_token[1:-1])
        if self.current_state == 'ct':
            self.current_entry.msgctxt += token
        elif self.current_state == 'mi':
            self.current_entry.msgid += token
        elif self.current_state == 'mp':
            self.current_entry.msgid_plural += token
        elif self.current_state == 'ms':
            self.current_entry.msgstr += token
        elif self.current_state == 'mx':
            self.current_entry.msgstr_plural[self.msgstr_index] += token
        elif self.current_state == 'pp':
            self.current_entry.previous_msgid_plural += token
        elif self.current_state == 'pm':
            self.current_entry.previous_msgid += token
        elif self.current_state == 'pc':
            self.current_entry.previous_msgctxt += token
        # don't change the current state
        return False
1676 # }}}
1677 # class _MOFileParser {{{
class _MOFileParser(object):
    """
    A class to parse binary mo files.
    """

    # pylint: disable=unused-argument,redefined-outer-name
    def __init__(self, mofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``mofile``
            string, path to the mo file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class to instantiate for the parsed file instead of
            :class:`MOFile` (optional).
        """
        # defined first so that __del__ stays safe if opening the file fails
        self.fhandle = None
        if _is_file(mofile):
            self.fhandle = open(mofile, 'rb')
        else:
            # not a file: treat the argument as the binary content itself
            self.fhandle = io.BytesIO(mofile)

        klass = kwargs.get('klass')
        if klass is None:
            klass = MOFile
        self.instance = klass(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )

    def __del__(self):
        """
        Make sure the file is closed, this prevents warnings on unclosed file
        when running tests with python >= 3.2.
        """
        # the attribute is missing when __init__ did not get far enough
        fhandle = getattr(self, 'fhandle', None)
        if fhandle and hasattr(fhandle, 'close'):
            fhandle.close()

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Returns the populated file instance. Raises ``IOError`` when the
        magic number or the major revision number is not recognised. The
        file handle is closed in every case, including on error.
        """
        try:
            self._parse_entries()
        finally:
            # close opened file
            self.fhandle.close()
        return self.instance

    def _parse_entries(self):
        """Read the header, the two index tables and every entry."""
        # parse magic number
        magic_number = self._readbinary('<I', 4)
        if magic_number == MOFile.MAGIC:
            ii = '<II'
        elif magic_number == MOFile.MAGIC_SWAPPED:
            ii = '>II'
        else:
            raise IOError('Invalid mo file, magic number is incorrect !')
        self.instance.magic_number = magic_number
        # parse the version number and the number of strings
        version, numofstrings = self._readbinary(ii, 8)
        # from MO file format specs: "A program seeing an unexpected major
        # revision number should stop reading the MO file entirely"
        if version >> 16 not in (0, 1):
            raise IOError('Invalid mo file, unexpected major revision number')
        self.instance.version = version
        # offsets of the msgid and msgstr index tables
        msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
        # each table holds one (length, offset) pair per string
        msgids_index = self._read_index(ii, msgids_hash_offset, numofstrings)
        msgstrs_index = self._read_index(ii, msgstrs_hash_offset,
                                         numofstrings)
        # build entries
        encoding = self.instance.encoding
        for i in range(numofstrings):
            self.fhandle.seek(msgids_index[i][1])
            msgid = self.fhandle.read(msgids_index[i][0])

            self.fhandle.seek(msgstrs_index[i][1])
            msgstr = self.fhandle.read(msgstrs_index[i][0])
            if i == 0 and not msgid:  # metadata
                self.instance.metadata = self._parse_metadata(msgstr,
                                                              encoding)
                continue
            # test if we have a plural entry
            msgid_tokens = msgid.split(b('\0'))
            if len(msgid_tokens) > 1:
                entry = self._build_entry(
                    msgid=msgid_tokens[0],
                    msgid_plural=msgid_tokens[1],
                    msgstr_plural=dict(enumerate(msgstr.split(b('\0'))))
                )
            else:
                entry = self._build_entry(msgid=msgid, msgstr=msgstr)
            self.instance.append(entry)

    def _read_index(self, fmt, offset, count):
        """Read ``count`` (length, offset) pairs starting at ``offset``."""
        self.fhandle.seek(offset)
        return [self._readbinary(fmt, 8) for _ in range(count)]

    @staticmethod
    def _parse_metadata(msgstr, encoding):
        """Turn the raw ``key: value`` lines of the header into a dict."""
        metadata = {}
        for line in msgstr.split(b('\n')):
            tokens = line.split(b(':'), 1)
            if tokens[0] == b(''):
                continue
            key = tokens[0].decode(encoding)
            if len(tokens) > 1:
                metadata[key] = tokens[1].decode(encoding).strip()
            else:
                # a key without any colon gets an empty value
                metadata[key] = u('')
        return metadata

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
                     msgstr_plural=None):
        """
        Create a :class:`MOEntry` from raw (bytes) values, decoding them
        with the instance encoding. A ``\\x04`` byte in ``msgid`` separates
        the message context from the message id.
        """
        # split once only: whatever follows the first separator is the msgid
        msgctxt_msgid = msgid.split(b('\x04'), 1)
        encoding = self.instance.encoding
        if len(msgctxt_msgid) > 1:
            kwargs = {
                'msgctxt': msgctxt_msgid[0].decode(encoding),
                'msgid': msgctxt_msgid[1].decode(encoding),
            }
        else:
            kwargs = {'msgid': msgid.decode(encoding)}
        if msgstr:
            kwargs['msgstr'] = msgstr.decode(encoding)
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
        if msgstr_plural:
            # decode into a new dict rather than mutating the argument
            kwargs['msgstr_plural'] = dict(
                (k, v.decode(encoding)) for k, v in msgstr_plural.items())
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpack n bytes of data using format <fmt>.
        It returns a tuple or a mixed value if the tuple length is 1.
        """
        content = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, content)
        if len(tup) == 1:
            return tup[0]
        return tup
1826 # }}}