fetch: add ability to fetch into a remote tracking branch
[git-cola.git] / cola / polib.py
blobe851263dcc51698ea171a481fd353635cfbac1be
2 # License: MIT (see extras/polib/LICENSE file provided)
3 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
5 """
6 **polib** allows you to manipulate, create, modify gettext files (pot, po and
7 mo files). You can load existing files, iterate through its entries, add,
8 modify entries, comments or metadata, etc. or create new po files from scratch.
10 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
11 :func:`~polib.mofile` convenience functions.
12 """
13 import array
14 import codecs
15 import os
16 import re
17 import struct
18 import sys
19 import textwrap
20 import io
22 from . import compat
__author__ = 'David Jean Louis <izimobil@gmail.com>'
__version__ = '1.1.1'
# the public names of this module
__all__ = [
    'pofile',
    'POFile',
    'POEntry',
    'mofile',
    'MOFile',
    'MOEntry',
    'default_encoding',
    'escape',
    'unescape',
    'detect_encoding',
]

# the default encoding to use when encoding cannot be detected
default_encoding = 'utf-8'
44 # python 2/3 compatibility helpers {{{
# Hard-coded for Python 3: other code in this module still tests ``PY3`` and
# uses ``text_type`` for its isinstance() checks.
PY3 = True
text_type = str
def b(s):
    """
    Returns the text string ``s`` encoded as UTF-8 bytes.
    """
    return s.encode('utf-8')
def u(s):
    """
    Returns ``s`` unchanged (identity helper, no conversion is performed).
    """
    return s
59 # }}}
60 # _pofile_or_mofile {{{
def _pofile_or_mofile(f, filetype, **kwargs):
    """
    Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
    honor the DRY concept.

    Arguments:

    ``f``
        the path or content handed to :func:`detect_encoding` and to the
        parser.

    ``filetype``
        string, ``'pofile'`` selects ``_POFileParser``, any other value
        selects ``_MOFileParser``.

    Recognized keyword arguments are ``encoding``, ``check_for_duplicates``,
    ``klass`` and ``wrapwidth``.

    Returns the instance produced by the parser's ``parse()`` method, with
    its ``wrapwidth`` attribute set (default: ``78``).
    """
    # get the file encoding (auto-detect when the caller did not give one)
    enc = kwargs.get('encoding')
    if enc is None:
        enc = detect_encoding(f, filetype == 'mofile')

    # parse the file
    kls = _POFileParser if filetype == 'pofile' else _MOFileParser
    parser = kls(
        f,
        encoding=enc,
        check_for_duplicates=kwargs.get('check_for_duplicates', False),
        klass=kwargs.get('klass'),
    )
    instance = parser.parse()
    instance.wrapwidth = kwargs.get('wrapwidth', 78)
    return instance
86 # }}}
87 # _is_file {{{
def _is_file(filename_or_contents):
    """
    Safely returns the value of os.path.isfile(filename_or_contents).

    Arguments:

    ``filename_or_contents``
        either a filename, or a string holding the contents of some file.

    Returns ``False`` instead of raising when ``os.path.isfile`` fails with
    ``TypeError``, ``ValueError`` or ``UnicodeEncodeError``.
    """
    try:
        return os.path.isfile(filename_or_contents)
    except (TypeError, ValueError, UnicodeEncodeError):
        return False
106 # }}}
107 # function pofile() {{{
def pofile(pofile, **kwargs):
    """
    Convenience function that parses the po or pot file ``pofile`` and returns
    a :class:`~polib.POFile` instance.

    Arguments:

    ``pofile``
        string, full or relative path to the po/pot file or its content (data).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.POFile`
        instance).
    """
    return _pofile_or_mofile(pofile, 'pofile', **kwargs)
140 # }}}
141 # function mofile() {{{
def mofile(mofile, **kwargs):
    """
    Convenience function that parses the mo file ``mofile`` and returns a
    :class:`~polib.MOFile` instance.

    Arguments:

    ``mofile``
        string, full or relative path to the mo file or its content (string
        or bytes).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext to generate the po file that was used to format the mo file
        (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the class is then chosen by the mo parser;
        presumably :class:`~polib.MOFile` as stated above, TODO confirm
        against ``_MOFileParser``).
    """
    return _pofile_or_mofile(mofile, 'mofile', **kwargs)
176 # }}}
177 # function detect_encoding() {{{
def detect_encoding(file, binary_mode=False):
    """
    Try to detect the encoding used by the ``file``. The ``file`` argument can
    be a PO or MO file path or a string containing the contents of the file.
    If the encoding cannot be detected, the function will return the value of
    ``default_encoding``.

    Arguments:

    ``file``
        string, full or relative path to the po/mo file or its content.

    ``binary_mode``
        boolean, set this to True if ``file`` is a mo file.
    """
    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
    rxt = re.compile(u(PATTERN))
    rxb = re.compile(b(PATTERN))

    def charset_exists(charset):
        """Check whether ``charset`` is valid or not."""
        try:
            codecs.lookup(charset)
        except LookupError:
            return False
        return True

    def declared_charset(match):
        """Return the charset captured by ``match`` as a text string."""
        enc = match.group(1).strip()
        if not isinstance(enc, text_type):
            enc = enc.decode('utf-8')
        return enc

    if not _is_file(file):
        # ``file`` holds the content itself: a bytes content makes the text
        # pattern raise TypeError, in which case the bytes pattern is used
        try:
            match = rxt.search(file)
        except TypeError:
            match = rxb.search(file)
        if match:
            enc = declared_charset(match)
            if charset_exists(enc):
                return enc
    else:
        # For PY3, always treat as binary
        if binary_mode or PY3:
            mode = 'rb'
            rx = rxb
        else:
            mode = 'r'
            rx = rxt
        # the context manager closes the file on every exit path, including
        # an exception raised while reading or matching
        with open(file, mode) as f:
            for line in f:
                match = rx.search(line)
                if match:
                    enc = declared_charset(match)
                    # an unknown charset is skipped and the scan continues
                    if charset_exists(enc):
                        return enc
    return default_encoding
240 # }}}
241 # function escape() {{{
# Maps each character that must be escaped to its backslash sequence.
_ESCAPE_TABLE = str.maketrans(
    {
        '\\': r'\\',
        '\t': r'\t',
        '\r': r'\r',
        '\n': r'\n',
        '"': r'\"',
    }
)


def escape(st):
    """
    Escapes the backslash, tab, carriage return, newline and double quote
    characters in the given string ``st`` and returns the result.
    """
    # a single pass: backslashes produced by the escaping are never
    # escaped a second time
    return st.translate(_ESCAPE_TABLE)
258 # }}}
259 # function unescape() {{{
def unescape(st):
    """
    Replaces the escape sequences for backslash, tab, newline, carriage
    return and double quote in the given string ``st`` by the characters
    they stand for, and returns the result.
    """
    replacements = {'n': '\n', 't': '\t', 'r': '\r'}

    def unescape_repl(m):
        char = m.group(1)
        # an escaped backslash or double quote stands for itself
        return replacements.get(char, char)

    return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
283 # }}}
284 # function natural_sort() {{{
def natural_sort(lst):
    """
    Sort naturally the given list.
    Credits: http://stackoverflow.com/a/4836734

    Returns a new sorted list; runs of ASCII digits are compared as
    integers and the remaining text is compared case-insensitively.
    """

    def alphanum_key(key):
        # re.split() with a capturing group alternates text and digit runs,
        # so the digit runs sit at the odd positions. Converting only those
        # keeps text and numbers aligned between keys, and never hands
        # int() a character such as a superscript digit.
        return [
            int(chunk) if index % 2 else chunk.lower()
            for index, chunk in enumerate(re.split('([0-9]+)', key))
        ]

    return sorted(lst, key=alphanum_key)
302 # }}}
303 # class _BaseFile {{{
class _BaseFile(list):
    """
    Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
    classes. This class should **not** be instantiated directly.
    """

    def __init__(self, *_args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``pofile``
            string, the path to the po or mo file, or its content as a string.

        ``fpath``
            string, the file path to use when ``pofile`` is empty or is not
            the path of an existing file (optional).

        ``wrapwidth``
            integer, the wrap width, only useful when the ``-w`` option was
            passed to xgettext (optional, default: ``78``).

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file, (optional, default: ``False``).
        """
        list.__init__(self)
        # the file path: ``pofile`` when it names an existing file, otherwise
        # the ``fpath`` keyword (which may be None)
        pofile = kwargs.get('pofile', None)
        if pofile and _is_file(pofile):
            self.fpath = pofile
        else:
            self.fpath = kwargs.get('fpath')
        # the width at which lines should be wrapped
        self.wrapwidth = kwargs.get('wrapwidth', 78)
        # the file encoding
        self.encoding = kwargs.get('encoding', default_encoding)
        # whether to check for duplicate entries or not
        self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
        # header
        self.header = ''
        # both po and mo files have metadata
        self.metadata = {}
        self.metadata_is_fuzzy = 0

    def __unicode__(self):
        """
        Returns the unicode representation of the file.
        """
        # the metadata entry comes first, then the regular entries, and the
        # obsolete entries are written last
        entries = [self.metadata_as_entry()] + [e for e in self if not e.obsolete]
        ret = [entry.__unicode__(self.wrapwidth) for entry in entries]
        ret.extend(
            entry.__unicode__(self.wrapwidth) for entry in self.obsolete_entries()
        )
        return '\n'.join(ret)

    def __str__(self):
        """
        Returns the string representation of the file.
        """
        return self.__unicode__()

    def __contains__(self, entry):
        """
        Overridden ``list`` method to implement the membership test (in and
        not in).
        The method considers that an entry is in the file if it finds an entry
        that has the same msgid (the test is **case sensitive**) and the same
        msgctxt (or none for both entries).

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) is not None

    def __eq__(self, other):
        return str(self) == str(other)

    def __hash__(self):
        return hash(str(self))

    def append(self, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        # check_for_duplicates may not be defined (yet) when unpickling.
        # But if pickling, we never want to check for duplicates anyway.
        if getattr(self, 'check_for_duplicates', False) and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super().append(entry)

    def insert(self, index, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Arguments:

        ``index``
            index at which the entry should be inserted.

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        if self.check_for_duplicates and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super().insert(index, entry)

    def metadata_as_entry(self):
        """
        Returns the file metadata as a :class:`~polib.POEntry` instance whose
        msgid is empty and whose msgstr holds one ``name: value`` line per
        metadata item.
        """
        e = POEntry(msgid='')
        mdata = self.ordered_metadata()
        if mdata:
            strs = [f'{name}: {value}' for name, value in mdata]
            e.msgstr = '\n'.join(strs) + '\n'
        if self.metadata_is_fuzzy:
            e.flags.append('fuzzy')
        return e

    def save(self, fpath=None, repr_method='__unicode__', newline=None):
        """
        Saves the po file to ``fpath``.
        If it is an existing file and no ``fpath`` is provided, then the
        existing file is rewritten with the modified data.

        Keyword arguments:

        ``fpath``
            string, full or relative path to the file.

        ``repr_method``
            string, the method to use for output.

        ``newline``
            string, controls how universal newlines works

        Raises ``OSError`` when neither ``fpath`` nor ``self.fpath`` is set.
        """
        if self.fpath is None and fpath is None:
            raise OSError('You must provide a file path to save() method')
        contents = getattr(self, repr_method)()
        if fpath is None:
            fpath = self.fpath
        if repr_method == 'to_binary':
            with open(fpath, 'wb') as fhandle:
                fhandle.write(contents)
        else:
            # decode before opening, so that a decoding error does not leave
            # a truncated file behind
            if not isinstance(contents, str):
                contents = contents.decode(self.encoding)
            with open(fpath, 'w', encoding=self.encoding, newline=newline) as fhandle:
                fhandle.write(contents)
        # set the file path if not set
        if self.fpath is None and fpath:
            self.fpath = fpath

    def find(self, st, by='msgid', include_obsolete_entries=False, msgctxt=False):
        """
        Find the entry which msgid (or property identified by the ``by``
        argument) matches the string ``st``.

        Keyword arguments:

        ``st``
            string, the string to search for.

        ``by``
            string, the property to use for comparison (default: ``msgid``).

        ``include_obsolete_entries``
            boolean, whether to also search in entries that are obsolete.

        ``msgctxt``
            string, allows specifying a specific message context for the
            search (default: ``False``, the context is not compared).

        Returns the matching entry, or ``None`` when there is no match.
        """
        if include_obsolete_entries:
            entries = self[:]
        else:
            entries = [e for e in self if not e.obsolete]
        matches = []
        for e in entries:
            if getattr(e, by) == st:
                if msgctxt is not False and e.msgctxt != msgctxt:
                    continue
                matches.append(e)
        if len(matches) == 1:
            return matches[0]
        elif len(matches) > 1:
            if not msgctxt:
                # find the entry with no msgctx (the last one when several
                # matches have none)
                e = None
                for m in matches:
                    if not m.msgctxt:
                        e = m
                if e:
                    return e
            # fallback to the first entry found
            return matches[0]
        return None

    def ordered_metadata(self):
        """
        Convenience method that returns an ordered version of the metadata
        dictionary. The return value is list of tuples (metadata name,
        metadata_value).
        """
        # copy the dict first
        metadata = self.metadata.copy()
        data_order = [
            'Project-Id-Version',
            'Report-Msgid-Bugs-To',
            'POT-Creation-Date',
            'PO-Revision-Date',
            'Last-Translator',
            'Language-Team',
            'Language',
            'MIME-Version',
            'Content-Type',
            'Content-Transfer-Encoding',
            'Plural-Forms',
        ]
        ordered_data = []
        for data in data_order:
            if data in metadata:
                ordered_data.append((data, metadata.pop(data)))
        # the rest of the metadata will be alphabetically ordered since there
        # are no specs for this AFAIK
        for data in natural_sort(metadata.keys()):
            ordered_data.append((data, metadata[data]))
        return ordered_data

    def to_binary(self):
        """
        Return the binary representation of the file.
        """
        offsets = []
        entries = self.translated_entries()

        # the keys are sorted in the .mo file
        entries.sort(key=lambda o: o.msgid_with_context.encode('utf-8'))
        # add metadata entry
        mentry = self.metadata_as_entry()
        entries = [mentry] + entries
        entries_len = len(entries)
        ids, strs = b'', b''
        for e in entries:
            # For each string, we need size and file offset.  Each string is
            # NUL terminated; the NUL does not count into the size.
            msgid = b''
            if e.msgctxt:
                # Contexts are stored by storing the concatenation of the
                # context, a <EOT> byte, and the original string
                msgid = self._encode(e.msgctxt + '\4')
            if e.msgid_plural:
                msgstr = []
                for index in sorted(e.msgstr_plural.keys()):
                    msgstr.append(e.msgstr_plural[index])
                msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
                msgstr = self._encode('\0'.join(msgstr))
            else:
                msgid += self._encode(e.msgid)
                msgstr = self._encode(e.msgstr)
            offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
            ids += msgid + b'\0'
            strs += msgstr + b'\0'

        # The header is 7 32-bit unsigned integers.
        keystart = 7 * 4 + 16 * entries_len
        # and the values start after the keys
        valuestart = keystart + len(ids)
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets += [l1, o1 + keystart]
            voffsets += [l2, o2 + valuestart]
        offsets = koffsets + voffsets

        output = struct.pack(
            'Iiiiiii',
            # Magic number
            MOFile.MAGIC,
            # Version
            0,
            # number of entries
            entries_len,
            # start of key index
            7 * 4,
            # start of value index
            7 * 4 + entries_len * 8,
            # size and offset of hash table, we don't use hash tables
            0,
            keystart,
        )
        output += array.array('i', offsets).tobytes()
        output += ids
        output += strs
        return output

    def _encode(self, mixed):
        """
        Encodes the given ``mixed`` argument with the file encoding if and
        only if it's an unicode string and returns the encoded string.
        """
        if isinstance(mixed, str):
            mixed = mixed.encode(self.encoding)
        return mixed
655 # }}}
656 # class POFile {{{
class POFile(_BaseFile):
    """
    Po (or Pot) file reader/writer.
    This class inherits the :class:`~polib._BaseFile` class and, by extension,
    the python ``list`` type.
    """

    def __unicode__(self):
        """
        Returns the unicode representation of the po file.
        """
        # every header line is written as a comment; lines that start with
        # ',' or ':' are glued to the '#' without a separating space
        lines = []
        for header in self.header.split('\n'):
            if not header:
                lines.append('#\n')
            elif header[:1] in [',', ':']:
                lines.append('#%s\n' % header)
            else:
                lines.append('# %s\n' % header)
        return ''.join(lines) + _BaseFile.__unicode__(self)

    def save_as_mofile(self, fpath):
        """
        Saves the binary representation of the file to given ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the mo file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    def percent_translated(self):
        """
        Convenience method that returns the percentage of translated
        messages, as an integer. Obsolete entries are not counted and a file
        without any other entry is reported as ``100``.
        """
        total = len([e for e in self if not e.obsolete])
        if total == 0:
            return 100
        translated = len(self.translated_entries())
        return int(translated * 100 / float(total))

    def translated_entries(self):
        """
        Convenience method that returns the list of translated entries.
        """
        return [e for e in self if e.translated()]

    def untranslated_entries(self):
        """
        Convenience method that returns the list of untranslated entries.
        """
        return [
            e for e in self if not e.translated() and not e.obsolete and not e.fuzzy
        ]

    def fuzzy_entries(self):
        """
        Convenience method that returns the list of fuzzy entries.
        """
        return [e for e in self if e.fuzzy and not e.obsolete]

    def obsolete_entries(self):
        """
        Convenience method that returns the list of obsolete entries.
        """
        return [e for e in self if e.obsolete]

    def merge(self, refpot):
        """
        Convenience method that merges the current pofile with the pot file
        provided:

        * every entry of ``refpot`` is merged (see :meth:`POEntry.merge`)
          into the entry of this file that has the same msgid and context,
          or into a new entry appended to this file when there is none;
        * entries of this file that are not in ``refpot`` are marked as
          obsolete.

        Keyword argument:

        ``refpot``
            object POFile, the reference catalog.
        """
        # Store entries in dict/set for faster access
        self_entries = {entry.msgid_with_context: entry for entry in self}
        refpot_msgids = {entry.msgid_with_context for entry in refpot}
        # Merge entries that are in the refpot
        for entry in refpot:
            e = self_entries.get(entry.msgid_with_context)
            if e is None:
                e = POEntry()
                self.append(e)
            e.merge(entry)
        # ok, now we must "obsolete" entries that are not in the refpot anymore
        for entry in self:
            if entry.msgid_with_context not in refpot_msgids:
                entry.obsolete = True
764 # }}}
765 # class MOFile {{{
class MOFile(_BaseFile):
    """
    Mo file reader/writer.
    This class inherits the :class:`~polib._BaseFile` class and, by
    extension, the python ``list`` type.
    """

    MAGIC = 0x950412DE
    MAGIC_SWAPPED = 0xDE120495

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts all keywords arguments accepted by
        :class:`~polib._BaseFile` class.
        """
        _BaseFile.__init__(self, *args, **kwargs)
        self.magic_number = None
        self.version = 0

    def save_as_pofile(self, fpath):
        """
        Saves the mofile as a pofile to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath)

    def save(self, fpath=None):
        """
        Saves the mofile to ``fpath``, using its binary representation.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    def percent_translated(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns ``100``.
        """
        return 100

    def translated_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Returns the file itself (not a copy).
        """
        return self

    def untranslated_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []

    def fuzzy_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []

    def obsolete_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []
840 # }}}
841 # class _BaseEntry {{{
class _BaseEntry:
    """
    Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
    This class should **not** be instantiated directly.
    """

    def __init__(self, *_args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``msgid``
            string, the entry msgid.

        ``msgstr``
            string, the entry msgstr.

        ``msgid_plural``
            string, the entry msgid_plural.

        ``msgstr_plural``
            dict, the entry msgstr_plural lines.

        ``msgctxt``
            string, the entry context (msgctxt).

        ``obsolete``
            bool, whether the entry is "obsolete" or not.

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).
        """
        self.msgid = kwargs.get('msgid', '')
        self.msgstr = kwargs.get('msgstr', '')
        self.msgid_plural = kwargs.get('msgid_plural', '')
        self.msgstr_plural = kwargs.get('msgstr_plural', {})
        self.msgctxt = kwargs.get('msgctxt', None)
        self.obsolete = kwargs.get('obsolete', False)
        self.encoding = kwargs.get('encoding', default_encoding)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry.
        """
        # every line of an obsolete entry is prefixed with '#~ '
        if self.obsolete:
            delflag = '#~ '
        else:
            delflag = ''
        ret = []
        # write the msgctxt if any
        if self.msgctxt is not None:
            ret += self._str_field('msgctxt', delflag, '', self.msgctxt, wrapwidth)
        # write the msgid
        ret += self._str_field('msgid', delflag, '', self.msgid, wrapwidth)
        # write the msgid_plural if any
        if self.msgid_plural:
            ret += self._str_field(
                'msgid_plural', delflag, '', self.msgid_plural, wrapwidth
            )
        if self.msgstr_plural:
            # write the msgstr_plural if any
            msgstrs = self.msgstr_plural
            for index in sorted(msgstrs):
                msgstr = msgstrs[index]
                plural_index = '[%s]' % index
                ret += self._str_field(
                    'msgstr', delflag, plural_index, msgstr, wrapwidth
                )
        else:
            # otherwise write the msgstr
            ret += self._str_field('msgstr', delflag, '', self.msgstr, wrapwidth)
        ret.append('')
        return '\n'.join(ret)

    def __str__(self):
        """
        Returns the string representation of the entry.
        """
        return self.__unicode__()

    def __eq__(self, other):
        return str(self) == str(other)

    def __hash__(self):
        return hash(str(self))

    def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78):
        """
        Returns the list of output lines for one field of the entry.

        ``fieldname`` is the keyword to write (a ``previous_`` prefix is
        removed), ``delflag`` is put in front of every line, ``plural_index``
        is appended to the keyword, ``field`` is the value and ``wrapwidth``
        the width used for wrapping (no wrapping when it is not positive).
        """
        lines = field.splitlines(True)
        if len(lines) > 1:
            lines = [''] + lines  # start with initial empty line
        else:
            escaped_field = escape(field)
            specialchars_count = 0
            for c in ['\\', '\n', '\r', '\t', '"']:
                specialchars_count += field.count(c)
            # comparison must take into account fieldname length + one space
            # + 2 quotes (eg. msgid "<string>")
            flength = len(fieldname) + 3
            if plural_index:
                flength += len(plural_index)
            real_wrapwidth = wrapwidth - flength + specialchars_count
            if wrapwidth > 0 and len(field) > real_wrapwidth:
                # Wrap the line but take field name into account
                lines = [''] + [
                    unescape(item)
                    for item in textwrap.wrap(
                        escaped_field,
                        wrapwidth - 2,  # 2 for quotes ""
                        drop_whitespace=False,
                        break_long_words=False,
                    )
                ]
            else:
                lines = [field]
        if fieldname.startswith('previous_'):
            # quick and dirty trick to get the real field name
            fieldname = fieldname[9:]

        # the first line carries the keyword, the others are continuations
        ret = [f'{delflag}{fieldname}{plural_index} "{escape(lines.pop(0))}"']
        for line in lines:
            ret.append(f'{delflag}"{escape(line)}"')
        return ret

    @property
    def msgid_with_context(self):
        """
        The msgid, prefixed by the msgctxt and an EOT (``0x04``) character
        when the entry has a non-empty msgctxt.
        """
        if self.msgctxt:
            return f'{self.msgctxt}\x04{self.msgid}'
        return self.msgid
984 # }}}
985 # class POEntry {{{
class POEntry(_BaseEntry):
    """
    Represents a po file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``comment``
            string, the entry comment.

        ``tcomment``
            string, the entry translator comment.

        ``occurrences``
            list, the entry occurrences.

        ``flags``
            list, the entry flags.

        ``previous_msgctxt``
            string, the entry previous context.

        ``previous_msgid``
            string, the entry previous msgid.

        ``previous_msgid_plural``
            string, the entry previous msgid_plural.

        ``linenum``
            integer, the line number of the entry
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        self.comment = kwargs.get('comment', '')
        self.tcomment = kwargs.get('tcomment', '')
        self.occurrences = kwargs.get('occurrences', [])
        self.flags = kwargs.get('flags', [])
        self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
        self.previous_msgid = kwargs.get('previous_msgid', None)
        self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
        self.linenum = kwargs.get('linenum', None)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry.
        """
        ret = []
        # comments first, if any (with text wrapping as xgettext does);
        # an obsolete entry only keeps its translator comment
        if self.obsolete:
            comments = [('tcomment', '# ')]
        else:
            comments = [('comment', '#. '), ('tcomment', '# ')]
        for attribute, marker in comments:
            val = getattr(self, attribute)
            if val:
                for comment in val.split('\n'):
                    if len(comment) + len(marker) > wrapwidth > 0:
                        ret += textwrap.wrap(
                            comment,
                            wrapwidth,
                            initial_indent=marker,
                            subsequent_indent=marker,
                            break_long_words=False,
                        )
                    else:
                        ret.append(f'{marker}{comment}')

        # occurrences (with text wrapping as xgettext does)
        if not self.obsolete and self.occurrences:
            filelist = []
            for fpath, lineno in self.occurrences:
                if lineno:
                    filelist.append(f'{fpath}:{lineno}')
                else:
                    filelist.append(fpath)
            filestr = ' '.join(filelist)
            if len(filestr) + 3 > wrapwidth > 0:
                # textwrap split words that contain hyphen, this is not
                # what we want for filenames, so the dirty hack is to
                # temporally replace hyphens with a char that a file cannot
                # contain, like "*"
                ret += [
                    line.replace('*', '-')
                    for line in textwrap.wrap(
                        filestr.replace('-', '*'),
                        wrapwidth,
                        initial_indent='#: ',
                        subsequent_indent='#: ',
                        break_long_words=False,
                    )
                ]
            else:
                ret.append('#: ' + filestr)

        # flags (TODO: wrapping ?)
        if self.flags:
            ret.append('#, %s' % ', '.join(self.flags))

        # previous context and previous msgid/msgid_plural
        fields = ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural']
        if self.obsolete:
            prefix = '#~| '
        else:
            prefix = '#| '
        for f in fields:
            val = getattr(self, f)
            if val is not None:
                ret += self._str_field(f, prefix, '', val, wrapwidth)

        ret.append(_BaseEntry.__unicode__(self, wrapwidth))
        return '\n'.join(ret)

    def __cmp__(self, other):
        """
        Called by comparison operations if rich comparison is not defined.

        Returns ``-1``, ``0`` or ``1``. The entries are compared, in this
        order, on: obsolete state, sorted occurrences, msgctxt,
        msgid_plural, msgstr_plural values, msgid and msgstr.
        """

        def three_way(mine, theirs):
            if mine > theirs:
                return 1
            if mine < theirs:
                return -1
            return 0

        def plural_values(entry):
            plurals = entry.msgstr_plural
            if plurals and isinstance(plurals, dict):
                return list(plurals.values())
            return []

        # First: Obsolete test
        if self.obsolete != other.obsolete:
            return -1 if self.obsolete else 1
        # ``or`` stops at the first comparison that is not 0, so the later
        # criteria are only evaluated when the earlier ones are equal
        return (
            # sorted() works on a copy, the original lists are untouched
            three_way(sorted(self.occurrences), sorted(other.occurrences))
            or three_way(self.msgctxt or '0', other.msgctxt or '0')
            or three_way(self.msgid_plural or '0', other.msgid_plural or '0')
            or three_way(plural_values(self), plural_values(other))
            or three_way(self.msgid, other.msgid)
            or three_way(self.msgstr, other.msgstr)
        )

    def __gt__(self, other):
        return self.__cmp__(other) > 0

    def __lt__(self, other):
        return self.__cmp__(other) < 0

    def __ge__(self, other):
        return self.__cmp__(other) >= 0

    def __le__(self, other):
        return self.__cmp__(other) <= 0

    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0

    def translated(self):
        """
        Returns ``True`` if the entry has been translated or ``False``
        otherwise.

        An obsolete or fuzzy entry is never translated. Otherwise the entry
        is translated when its msgstr is not empty, or when it has plural
        forms and none of them is empty.
        """
        if self.obsolete or self.fuzzy:
            return False
        if self.msgstr != '':
            return True
        if self.msgstr_plural:
            return all(
                self.msgstr_plural[pos] != '' for pos in self.msgstr_plural
            )
        return False

    def merge(self, other):
        """
        Merge the current entry with the given pot entry.

        The msgid, msgctxt, occurrences, comment, flags, msgid_plural,
        obsolete state and previous_* fields are taken from ``other``. The
        entry stays fuzzy if it was fuzzy, and a plural form is added (empty)
        for every position of ``other`` that this entry does not have yet.
        """
        self.msgid = other.msgid
        self.msgctxt = other.msgctxt
        self.occurrences = other.occurrences
        self.comment = other.comment
        fuzzy = self.fuzzy
        self.flags = other.flags[:]  # clone flags
        if fuzzy:
            self.flags.append('fuzzy')
        self.msgid_plural = other.msgid_plural
        self.obsolete = other.obsolete
        self.previous_msgctxt = other.previous_msgctxt
        self.previous_msgid = other.previous_msgid
        self.previous_msgid_plural = other.previous_msgid_plural
        if other.msgstr_plural:
            for pos in other.msgstr_plural:
                # keep existing translation at pos if any
                self.msgstr_plural.setdefault(pos, '')

    @property
    def fuzzy(self):
        """Whether the ``fuzzy`` flag is among the entry flags."""
        return 'fuzzy' in self.flags

    def __hash__(self):
        return hash((self.msgid, self.msgstr))
1225 # }}}
1226 # class MOEntry {{{
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments,
        for consistency with :class:`~polib.POEntry`:

        ``comment``
        ``tcomment``
        ``occurrences``
        ``flags``
        ``previous_msgctxt``
        ``previous_msgid``
        ``previous_msgid_plural``

        Note: even though these keyword arguments are accepted,
        they hold no real meaning in the context of MO files
        and are simply ignored.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # the attributes exist for consistency with POEntry, but they always
        # start empty whatever was passed in
        self.comment = ''
        self.tcomment = ''
        self.occurrences = []
        self.flags = []
        self.previous_msgctxt = None
        self.previous_msgid = None
        self.previous_msgid_plural = None

    def __hash__(self):
        return hash((self.msgid, self.msgstr))
1264 # }}}
1265 # class _POFileParser {{{
class _POFileParser:
    """
    A finite state machine to parse efficiently and correctly po
    file format.

    Each line of the input is classified into a two-letter symbol
    (see ``__init__`` for the list); ``process()`` then looks up the
    ``(symbol, current state)`` pair in ``self.transitions`` and runs the
    matching ``handle_*`` method.
    """

    # a double quote that is not preceded by a backslash
    _UNESCAPED_QUOTE = re.compile(r'([^\\]|^)"')
    # the (possibly multi-digit) index of a ``msgstr[N]`` line
    _PLURAL_INDEX = re.compile(r'msgstr\[(\d+)')
    # states after which a new comment/msgctxt/msgid starts a new entry
    _ENTRY_END_STATES = ('mc', 'ms', 'mx')
    # for a continuation line: current state -> entry attribute to extend
    _CONTINUATION_ATTRS = {
        'ct': 'msgctxt',
        'mi': 'msgid',
        'mp': 'msgid_plural',
        'ms': 'msgstr',
        'pp': 'previous_msgid_plural',
        'pm': 'previous_msgid',
        'pc': 'previous_msgctxt',
    }

    def __init__(self, pofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``pofile``
            string, path to the po file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class to instantiate instead of ``POFile`` (optional).
        """
        enc = kwargs.get('encoding', default_encoding)
        if _is_file(pofile):
            try:
                self.fhandle = open(pofile, encoding=enc)
            except LookupError:
                # unknown encoding name: fall back to the default one
                enc = default_encoding
                self.fhandle = open(pofile, encoding=enc)
        else:
            self.fhandle = pofile.splitlines()

        klass = kwargs.get('klass')
        if klass is None:
            klass = POFile
        self.instance = klass(
            pofile=pofile,
            encoding=enc,
            check_for_duplicates=kwargs.get('check_for_duplicates', False),
        )
        self.transitions = {}
        self.current_line = 0
        self.current_entry = POEntry(linenum=self.current_line)
        self.current_state = 'st'
        self.current_token = None
        # two memo flags used in handlers
        self.msgstr_index = 0
        self.entry_obsolete = 0
        # Configure the state machine, by adding transitions.
        # Meaning of the symbols:
        #     * ST: Beginning of the file (start)
        #     * HE: Header
        #     * TC: a translation comment
        #     * GC: a generated comment
        #     * OC: a file/line occurrence
        #     * FL: a flags line
        #     * CT: a message context
        #     * PC: a previous msgctxt
        #     * PM: a previous msgid
        #     * PP: a previous msgid_plural
        #     * MI: a msgid
        #     * MP: a msgid plural
        #     * MS: a msgstr
        #     * MX: a msgstr plural
        #     * MC: a msgid or msgstr continuation line
        all_states = [
            'st',
            'he',
            'gc',
            'oc',
            'fl',
            'ct',
            'pc',
            'pm',
            'pp',
            'tc',
            'ms',
            'mp',
            'mx',
            'mi',
        ]

        self.add('tc', ['st', 'he'], 'he')
        self.add(
            'tc',
            ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mp', 'mx', 'mi'],
            'tc',
        )
        self.add('gc', all_states, 'gc')
        self.add('oc', all_states, 'oc')
        self.add('fl', all_states, 'fl')
        self.add('pc', all_states, 'pc')
        self.add('pm', all_states, 'pm')
        self.add('pp', all_states, 'pp')
        self.add(
            'ct',
            ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
            'ct',
        )
        self.add(
            'mi',
            ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
            'mi',
        )
        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
        self.add('ms', ['mi', 'mp', 'tc'], 'ms')
        self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')

    def parse(self):
        """
        Run the state machine, parse the file line by line and call process()
        with the current matched symbol.

        Returns the populated file instance.  Raises ``OSError`` on a
        syntax error.  The underlying file, if one was opened, is closed
        when this method exits, whether it succeeds or raises.
        """
        keywords = {
            'msgctxt': 'ct',
            'msgid': 'mi',
            'msgstr': 'ms',
            'msgid_plural': 'mp',
        }
        prev_keywords = {
            'msgid_plural': 'pp',
            'msgid': 'pm',
            'msgctxt': 'pc',
        }
        tokens = []
        fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
        try:
            for line in self.fhandle:
                self.current_line += 1
                if self.current_line == 1:
                    BOM = codecs.BOM_UTF8.decode('utf-8')
                    if line.startswith(BOM):
                        line = line[len(BOM) :]
                line = line.strip()
                if line == '':
                    continue

                tokens = line.split(None, 2)
                nb_tokens = len(tokens)

                if tokens[0] == '#~|':
                    # previous-value comment of an obsolete entry: skipped
                    continue

                if tokens[0] == '#~' and nb_tokens > 1:
                    # obsolete entry: drop the marker, parse the rest normally
                    line = line[3:].strip()
                    tokens = tokens[1:]
                    nb_tokens -= 1
                    self.entry_obsolete = 1
                else:
                    self.entry_obsolete = 0

                # Take care of keywords like
                # msgid, msgid_plural, msgctxt & msgstr.
                if tokens[0] in keywords and nb_tokens > 1:
                    line = line[len(tokens[0]) :].lstrip()
                    if self._UNESCAPED_QUOTE.search(line[1:-1]):
                        raise OSError(
                            'Syntax error in po file %s(line %s): '
                            'unescaped double quote found' % (fpath, self.current_line)
                        )
                    self.current_token = line
                    self.process(keywords[tokens[0]])
                    continue

                self.current_token = line

                if tokens[0] == '#:':
                    if nb_tokens <= 1:
                        continue
                    # we are on a occurrences line
                    self.process('oc')

                elif line[:1] == '"':
                    # we are on a continuation line
                    if self._UNESCAPED_QUOTE.search(line[1:-1]):
                        raise OSError(
                            'Syntax error in po file %s(line %s): '
                            'unescaped double quote found' % (fpath, self.current_line)
                        )
                    self.process('mc')

                elif line[:7] == 'msgstr[':
                    # we are on a msgstr plural
                    self.process('mx')

                elif tokens[0] == '#,':
                    if nb_tokens <= 1:
                        continue
                    # we are on a flags line
                    self.process('fl')

                elif tokens[0] == '#' or tokens[0].startswith('##'):
                    # we are on a translator comment line
                    self.process('tc')

                elif tokens[0] == '#.':
                    if nb_tokens <= 1:
                        continue
                    # we are on a generated comment line
                    self.process('gc')

                elif tokens[0] == '#|':
                    if nb_tokens <= 1:
                        raise OSError(
                            'Syntax error in po file %s(line %s)'
                            % (fpath, self.current_line)
                        )

                    # Remove the marker and any whitespace right after that.
                    line = line[2:].lstrip()
                    self.current_token = line

                    if tokens[1].startswith('"'):
                        # Continuation of previous metadata.
                        self.process('mc')
                        continue

                    if nb_tokens == 2:
                        # Invalid continuation line.
                        raise OSError(
                            'Syntax error in po file %s(line %s): '
                            'invalid continuation line' % (fpath, self.current_line)
                        )

                    # we are on a "previous translation" comment line,
                    if tokens[1] not in prev_keywords:
                        # Unknown keyword in previous translation comment.
                        raise OSError(
                            'Syntax error in po file %s(line %s): '
                            'unknown keyword %s' % (fpath, self.current_line, tokens[1])
                        )

                    # Remove the keyword and any whitespace
                    # between it and the starting quote.
                    line = line[len(tokens[1]) :].lstrip()
                    self.current_token = line
                    self.process(prev_keywords[tokens[1]])

                else:
                    raise OSError(
                        f'Syntax error in po file {fpath}(line {self.current_line})'
                    )

            if self.current_entry and len(tokens) > 0 and not tokens[0].startswith('#'):
                # since entries are added when another entry is found, we must add
                # the last entry here (only if there are lines). Trailing comments
                # are ignored
                self.instance.append(self.current_entry)

            self._extract_metadata()
        finally:
            # close opened file, also when a syntax error aborts the parsing
            self._close_handle()
        return self.instance

    def _extract_metadata(self):
        """Move the empty-msgid entry, if any, into ``instance.metadata``."""
        metadataentry = self.instance.find('')
        if not metadataentry:
            return
        # remove the entry
        self.instance.remove(metadataentry)
        self.instance.metadata_is_fuzzy = metadataentry.flags
        key = None
        for msg in metadataentry.msgstr.splitlines():
            try:
                key, val = msg.split(':', 1)
                self.instance.metadata[key] = val.strip()
            except (ValueError, KeyError):
                # no "key: value" on this line: it continues the last key
                if key is not None:
                    self.instance.metadata[key] += '\n' + msg.strip()

    def _close_handle(self):
        """Close ``self.fhandle`` when it is closable (a list of lines is not)."""
        close = getattr(self.fhandle, 'close', None)
        if close is not None:
            close()

    def add(self, symbol, states, next_state):
        """
        Add a transition to the state machine.

        Keyword arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``states``
            list, a list of states (two chars symbols).

        ``next_state``
            the next state the fsm will have after the action.
        """
        action = getattr(self, 'handle_%s' % next_state)
        for state in states:
            self.transitions[(symbol, state)] = (action, next_state)

    def process(self, symbol):
        """
        Process the transition corresponding to the current state and the
        symbol provided.

        Keyword arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        Raises ``OSError`` when no transition exists for the symbol in the
        current state or when the handler fails; the file handle is closed
        first.
        """
        try:
            (action, state) = self.transitions[(symbol, self.current_state)]
            if action():
                self.current_state = state
        except Exception as exc:
            fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
            self._close_handle()
            raise OSError(
                f'Syntax error in po file {fpath}(line {self.current_line})'
            ) from exc

    # state handlers

    def _start_entry_if_needed(self):
        """Store the entry being built and begin a new one once it is complete."""
        if self.current_state in self._ENTRY_END_STATES:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)

    @classmethod
    def _plural_index(cls, token):
        """Return the integer N of a ``msgstr[N]`` token (any number of digits)."""
        match = cls._PLURAL_INDEX.match(token)
        if match is None:
            raise ValueError('invalid msgstr plural index: %r' % token)
        return int(match.group(1))

    def handle_he(self):
        """Handle a header comment."""
        if self.instance.header != '':
            self.instance.header += '\n'
        self.instance.header += self.current_token[2:]
        return True

    def handle_tc(self):
        """Handle a translator comment."""
        self._start_entry_if_needed()
        if self.current_entry.tcomment != '':
            self.current_entry.tcomment += '\n'
        tcomment = self.current_token.lstrip('#')
        if tcomment.startswith(' '):
            tcomment = tcomment[1:]
        self.current_entry.tcomment += tcomment
        return True

    def handle_gc(self):
        """Handle a generated comment."""
        self._start_entry_if_needed()
        if self.current_entry.comment != '':
            self.current_entry.comment += '\n'
        self.current_entry.comment += self.current_token[3:]
        return True

    def handle_oc(self):
        """Handle a file:num occurrence."""
        self._start_entry_if_needed()
        occurrences = self.current_token[3:].split()
        for occurrence in occurrences:
            if occurrence != '':
                try:
                    fil, line = occurrence.rsplit(':', 1)
                    if not line.isdigit():
                        # the colon is part of the file name, not a line number
                        fil = occurrence
                        line = ''
                    self.current_entry.occurrences.append((fil, line))
                except (ValueError, AttributeError):
                    # no colon at all: a bare file name
                    self.current_entry.occurrences.append((occurrence, ''))
        return True

    def handle_fl(self):
        """Handle a flags line."""
        self._start_entry_if_needed()
        self.current_entry.flags += [
            c.strip() for c in self.current_token[3:].split(',')
        ]
        return True

    def handle_pp(self):
        """Handle a previous msgid_plural line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_pm(self):
        """Handle a previous msgid line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgid = unescape(self.current_token[1:-1])
        return True

    def handle_pc(self):
        """Handle a previous msgctxt line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_ct(self):
        """Handle a msgctxt."""
        self._start_entry_if_needed()
        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_mi(self):
        """Handle a msgid."""
        self._start_entry_if_needed()
        self.current_entry.obsolete = self.entry_obsolete
        self.current_entry.msgid = unescape(self.current_token[1:-1])
        return True

    def handle_mp(self):
        """Handle a msgid plural."""
        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_ms(self):
        """Handle a msgstr."""
        self.current_entry.msgstr = unescape(self.current_token[1:-1])
        return True

    def handle_mx(self):
        """Handle a msgstr plural."""
        token = self.current_token
        index = self._plural_index(token)
        value = token[token.find('"') + 1 : -1]
        self.current_entry.msgstr_plural[index] = unescape(value)
        # remembered so continuation lines extend the right plural form
        self.msgstr_index = index
        return True

    def handle_mc(self):
        """Handle a msgid or msgstr continuation line."""
        token = unescape(self.current_token[1:-1])
        entry = self.current_entry
        if self.current_state == 'mx':
            entry.msgstr_plural[self.msgstr_index] += token
        else:
            attr = self._CONTINUATION_ATTRS.get(self.current_state)
            if attr is not None:
                setattr(entry, attr, getattr(entry, attr) + token)
        # don't change the current state
        return False
1725 # }}}
1726 # class _MOFileParser {{{
class _MOFileParser:
    """
    A class to parse binary mo files.
    """

    def __init__(self, mofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``mofile``
            string, path to the mo file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class to instantiate instead of ``MOFile`` (optional).
        """
        # defined first so that __del__ is safe even if open() fails below
        self.fhandle = None
        if _is_file(mofile):
            self.fhandle = open(mofile, 'rb')
        else:
            self.fhandle = io.BytesIO(mofile)

        klass = kwargs.get('klass')
        if klass is None:
            klass = MOFile
        self.instance = klass(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False),
        )

    def __del__(self):
        """
        Make sure the file is closed, this prevents warnings on unclosed file
        when running tests with python >= 3.2.
        """
        # the attribute is missing when the constructor never ran or failed
        fhandle = getattr(self, 'fhandle', None)
        if fhandle and hasattr(fhandle, 'close'):
            fhandle.close()

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Returns the populated file instance.  Raises ``OSError`` when the
        magic number or the major revision number is not an expected one.
        The file handle is closed when this method exits, whether it
        succeeds or raises.
        """
        try:
            # parse magic number
            magic_number = self._readbinary('<I', 4)
            if magic_number == MOFile.MAGIC:
                ii = '<II'
            elif magic_number == MOFile.MAGIC_SWAPPED:
                ii = '>II'
            else:
                raise OSError('Invalid mo file, magic number is incorrect !')
            self.instance.magic_number = magic_number
            # parse the version number and the number of strings
            version, numofstrings = self._readbinary(ii, 8)
            # from MO file format specs: "A program seeing an unexpected major
            # revision number should stop reading the MO file entirely"
            if version >> 16 not in (0, 1):
                raise OSError('Invalid mo file, unexpected major revision number')
            self.instance.version = version
            # original strings and translation strings hash table offset
            msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
            # move to msgid hash table and read length and offset of msgids
            self.fhandle.seek(msgids_hash_offset)
            msgids_index = [self._readbinary(ii, 8) for _ in range(numofstrings)]
            # move to msgstr hash table and read length and offset of msgstrs
            self.fhandle.seek(msgstrs_hash_offset)
            msgstrs_index = [self._readbinary(ii, 8) for _ in range(numofstrings)]
            # build entries
            encoding = self.instance.encoding
            for i in range(numofstrings):
                msgid_length, msgid_offset = msgids_index[i]
                self.fhandle.seek(msgid_offset)
                msgid = self.fhandle.read(msgid_length)

                msgstr_length, msgstr_offset = msgstrs_index[i]
                self.fhandle.seek(msgstr_offset)
                msgstr = self.fhandle.read(msgstr_length)
                if i == 0 and not msgid:  # metadata
                    metadata = {}
                    for line in msgstr.split(b'\n'):
                        # a line without ':' yields an empty value
                        key, _sep, value = line.partition(b':')
                        if key != b'':
                            metadata[key.decode(encoding)] = value.decode(
                                encoding
                            ).strip()
                    self.instance.metadata = metadata
                    continue
                # test if we have a plural entry
                msgid_tokens = msgid.split(b'\x00')
                if len(msgid_tokens) > 1:
                    entry = self._build_entry(
                        msgid=msgid_tokens[0],
                        msgid_plural=msgid_tokens[1],
                        msgstr_plural=dict(enumerate(msgstr.split(b'\x00'))),
                    )
                else:
                    entry = self._build_entry(msgid=msgid, msgstr=msgstr)
                self.instance.append(entry)
        finally:
            # close opened file, also when the file turns out to be invalid
            self.fhandle.close()
        return self.instance

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None, msgstr_plural=None):
        """
        Build a ``MOEntry`` from raw bytes, decoding every field with the
        instance encoding.  A ``\\x04`` byte in ``msgid`` separates the
        message context from the message id.
        """
        msgctxt_msgid = msgid.split(b'\x04')
        encoding = self.instance.encoding
        if len(msgctxt_msgid) > 1:
            kwargs = {
                'msgctxt': msgctxt_msgid[0].decode(encoding),
                'msgid': msgctxt_msgid[1].decode(encoding),
            }
        else:
            kwargs = {'msgid': msgid.decode(encoding)}
        if msgstr:
            kwargs['msgstr'] = msgstr.decode(encoding)
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
        if msgstr_plural:
            kwargs['msgstr_plural'] = {
                pos: raw.decode(encoding) for pos, raw in msgstr_plural.items()
            }
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpack n bytes of data using format <fmt>.
        It returns a tuple or a mixed value if the tuple length is 1.
        """
        content = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, content)
        if len(tup) == 1:
            return tup[0]
        return tup
1874 # }}}