cola: vendor polib.py
[git-cola.git] / cola / polib.py
blobe6de20e2d1e3cc11f84608947c5935d025b8ca7a
1 # -* coding: utf-8 -*-
3 # License: MIT (see LICENSE file provided)
4 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
6 """
7 **polib** allows you to manipulate, create, modify gettext files (pot, po and
8 mo files). You can load existing files, iterate through its entries, add,
9 modify entries, comments or metadata, etc. or create new po files from scratch.
11 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
12 :func:`~polib.mofile` convenience functions.
13 """
15 import array
16 import codecs
17 import os
18 import re
19 import struct
20 import sys
21 import textwrap
22 import io
25 __author__ = 'David Jean Louis <izimobil@gmail.com>'
26 __version__ = '1.1.1'
27 __all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
28 'default_encoding', 'escape', 'unescape', 'detect_encoding', ]
31 # the default encoding to use when encoding cannot be detected
32 default_encoding = 'utf-8'
34 # python 2/3 compatibility helpers {{{
# ``PY3``       -- True when running under Python 3 or later.
# ``text_type`` -- the interpreter's unicode string type.
# ``b(s)``      -- turns a native string literal into a byte string.
# ``u(s)``      -- turns a native string literal into a unicode string.
if sys.version_info < (3,):
    PY3 = False
    text_type = unicode

    def b(s):
        """Return ``s`` unchanged: native strings already are byte strings."""
        return s

    def u(s):
        """Decode the native string ``s`` (``unicode_escape``) to unicode."""
        return unicode(s, "unicode_escape")

else:
    PY3 = True
    text_type = str

    def b(s):
        """Encode the native (unicode) string ``s`` to bytes as latin-1."""
        return s.encode("latin-1")

    def u(s):
        """Return ``s`` unchanged: native strings already are unicode."""
        return s
56 # }}}
57 # _pofile_or_mofile {{{
def _pofile_or_mofile(f, type, **kwargs):
    """
    Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
    honor the DRY concept.

    Arguments:

    ``f``
        string, path to the file or its content.

    ``type``
        string, ``'pofile'`` to use the po parser; any other value selects
        the mo parser.

    Recognized keyword arguments: ``encoding``, ``check_for_duplicates``,
    ``klass`` and ``wrapwidth`` (default: ``78``).
    """
    # get the file encoding, detecting it when the caller did not force one
    enc = kwargs.get('encoding')
    if enc is None:
        enc = detect_encoding(f, type == 'mofile')

    # parse the file
    kls = _POFileParser if type == 'pofile' else _MOFileParser
    parser = kls(
        f,
        encoding=enc,
        check_for_duplicates=kwargs.get('check_for_duplicates', False),
        klass=kwargs.get('klass')
    )
    instance = parser.parse()
    instance.wrapwidth = kwargs.get('wrapwidth', 78)
    return instance
81 # }}}
82 # _is_file {{{
def _is_file(filename_or_contents):
    """
    Safely returns the value of os.path.isfile(filename_or_contents).

    Arguments:

    ``filename_or_contents``
        either a filename, or a string holding the contents of some file.
        When the check itself raises ``TypeError``, ``ValueError`` or
        ``UnicodeEncodeError``, this function returns False.
    """
    try:
        return os.path.isfile(filename_or_contents)
    except (TypeError, ValueError, UnicodeEncodeError):
        return False
99 # }}}
100 # function pofile() {{{
def pofile(pofile, **kwargs):
    """
    Convenience function that parses the po or pot file ``pofile`` and returns
    a :class:`~polib.POFile` instance.

    Arguments:

    ``pofile``
        string, full or relative path to the po/pot file or its content (data).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.POFile`
        instance).
    """
    return _pofile_or_mofile(pofile, 'pofile', **kwargs)
131 # }}}
132 # function mofile() {{{
def mofile(mofile, **kwargs):
    """
    Convenience function that parses the mo file ``mofile`` and returns a
    :class:`~polib.MOFile` instance.

    Arguments:

    ``mofile``
        string, full or relative path to the mo file or its content (string
        or bytes).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext to generate the po file that was used to format the mo file
        (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.MOFile`
        instance).
    """
    return _pofile_or_mofile(mofile, 'mofile', **kwargs)
165 # }}}
166 # function detect_encoding() {{{
def detect_encoding(file, binary_mode=False):
    """
    Try to detect the encoding used by the ``file``. The ``file`` argument can
    be a PO or MO file path or a string containing the contents of the file.
    If the encoding cannot be detected, the function will return the value of
    ``default_encoding``.

    Arguments:

    ``file``
        string, full or relative path to the po/mo file or its content.

    ``binary_mode``
        boolean, set this to True if ``file`` is a mo file.
    """
    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
    rxt = re.compile(u(PATTERN))
    rxb = re.compile(b(PATTERN))

    def charset_from(match):
        """
        Return the charset captured by ``match`` as text when Python has a
        codec for it, ``None`` otherwise.
        """
        enc = match.group(1).strip()
        if not isinstance(enc, text_type):
            enc = enc.decode('utf-8')
        try:
            codecs.lookup(enc)
        except LookupError:
            return None
        return enc

    if not _is_file(file):
        # ``file`` holds the contents: try the text pattern first and fall
        # back to the bytes pattern when the contents are a byte string
        try:
            match = rxt.search(file)
        except TypeError:
            match = rxb.search(file)
        if match:
            enc = charset_from(match)
            if enc is not None:
                return enc
    else:
        # For PY3, always treat as binary
        if binary_mode or PY3:
            mode, rx = 'rb', rxb
        else:
            mode, rx = 'r', rxt
        # the context manager closes the handle on every exit path; lines
        # are read lazily and scanning goes on past an unknown charset
        with open(file, mode) as f:
            for line in f:
                match = rx.search(line)
                if match:
                    enc = charset_from(match)
                    if enc is not None:
                        return enc
    return default_encoding
227 # }}}
228 # function escape() {{{
def escape(st):
    """
    Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
    the given string ``st`` and returns it.
    """
    # the backslash must be handled first, otherwise the backslashes added
    # by the later replacements would be escaped a second time
    replacements = (
        ('\\', r'\\'),
        ('\t', r'\t'),
        ('\r', r'\r'),
        ('\n', r'\n'),
        ('\"', r'\"'),
    )
    for raw, escaped in replacements:
        st = st.replace(raw, escaped)
    return st
241 # }}}
242 # function unescape() {{{
def unescape(st):
    """
    Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
    the given string ``st`` and returns it.
    """
    decoded = {'n': '\n', 't': '\t', 'r': '\r'}

    def unescape_repl(m):
        char = m.group(1)
        # an escaped backslash or double quote stands for itself
        return decoded.get(char, char)

    return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
262 # }}}
263 # function natural_sort() {{{
def natural_sort(lst):
    """
    Sort naturally the given list.
    Credits: http://stackoverflow.com/a/4836734

    Runs of ASCII digits are compared as integers, everything else is
    compared case-insensitively. Returns a new sorted list.
    """
    digits = re.compile(r'([0-9]+)')

    def alphanum_key(key):
        # splitting on a capturing group alternates text and digit chunks:
        # odd positions are always the ``[0-9]+`` matches. Converting by
        # position (rather than with ``str.isdigit``) keeps non-ASCII digit
        # characters such as superscripts as text, so ``int()`` cannot fail
        # and integers are never compared with strings.
        return [
            int(chunk) if position % 2 else chunk.lower()
            for position, chunk in enumerate(digits.split(key))
        ]

    return sorted(lst, key=alphanum_key)
279 # }}}
280 # class _BaseFile {{{
class _BaseFile(list):
    """
    Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
    classes. This class should **not** be instantiated directly.

    The methods below call ``translated_entries()`` and
    ``obsolete_entries()``, which this class does not define itself.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``pofile``
            string, the path to the po or mo file, or its content as a string.

        ``wrapwidth``
            integer, the wrap width, only useful when the ``-w`` option was
            passed to xgettext (optional, default: ``78``).

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file, (optional, default: ``False``).
        """
        list.__init__(self)
        # remember the path only when ``pofile`` really is a file on disk
        # (it may also hold the file contents); otherwise use ``fpath``
        pofile = kwargs.get('pofile', None)
        if pofile and _is_file(pofile):
            self.fpath = pofile
        else:
            self.fpath = kwargs.get('fpath')
        # the width at which lines should be wrapped
        self.wrapwidth = kwargs.get('wrapwidth', 78)
        # the file encoding
        self.encoding = kwargs.get('encoding', default_encoding)
        # whether to check for duplicate entries or not
        self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
        # header
        self.header = ''
        # both po and mo files have metadata
        self.metadata = {}
        self.metadata_is_fuzzy = 0

    def __unicode__(self):
        """
        Returns the unicode representation of the file.
        """
        # metadata first, then the live entries, obsolete entries last
        entries = [self.metadata_as_entry()]
        entries.extend(e for e in self if not e.obsolete)
        entries.extend(self.obsolete_entries())
        return u('\n').join(
            [entry.__unicode__(self.wrapwidth) for entry in entries]
        )

    if PY3:
        def __str__(self):
            """
            Returns the string representation of the file.
            """
            return self.__unicode__()
    else:
        def __str__(self):
            """
            Returns the string representation of the file.
            """
            return unicode(self).encode(self.encoding)

    def __contains__(self, entry):
        """
        Overridden ``list`` method to implement the membership test (in and
        not in).
        The method considers that an entry is in the file if it finds an entry
        that has the same msgid (the test is **case sensitive**) and the same
        msgctxt (or none for both entries).

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        found = self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt)
        return found is not None

    def __eq__(self, other):
        """
        Two files are equal when their string representations are equal.
        """
        return str(self) == str(other)

    def append(self, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        # check_for_duplicates may not be defined (yet) when unpickling.
        # But if pickling, we never want to check for duplicates anyway.
        if getattr(self, 'check_for_duplicates', False) and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super(_BaseFile, self).append(entry)

    def insert(self, index, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception.

        Arguments:

        ``index``
            index at which the entry should be inserted.

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        # same tolerant attribute lookup as append(), see the note there
        if getattr(self, 'check_for_duplicates', False) and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super(_BaseFile, self).insert(index, entry)

    def metadata_as_entry(self):
        """
        Returns the file metadata as a :class:`~polib.POEntry` instance with
        an empty msgid; the entry is flagged fuzzy when
        ``metadata_is_fuzzy`` is set.
        """
        e = POEntry(msgid='')
        mdata = self.ordered_metadata()
        if mdata:
            # one "name: value" line per metadata item, newline terminated
            strs = ['%s: %s' % (name, value) for name, value in mdata]
            e.msgstr = '\n'.join(strs) + '\n'
        if self.metadata_is_fuzzy:
            e.flags.append('fuzzy')
        return e

    def save(self, fpath=None, repr_method='__unicode__', newline=None):
        """
        Saves the po file to ``fpath``.
        If it is an existing file and no ``fpath`` is provided, then the
        existing file is rewritten with the modified data.

        Keyword arguments:

        ``fpath``
            string, full or relative path to the file.

        ``repr_method``
            string, the method to use for output.

        ``newline``
            string, controls how universal newlines works

        Raises ``IOError`` when no path is given and none is known.
        """
        if self.fpath is None and fpath is None:
            raise IOError('You must provide a file path to save() method')
        contents = getattr(self, repr_method)()
        if fpath is None:
            fpath = self.fpath
        if repr_method == 'to_binary':
            with open(fpath, 'wb') as fhandle:
                fhandle.write(contents)
        else:
            # decode before opening so that a decoding error cannot leave a
            # truncated file behind
            if not isinstance(contents, text_type):
                contents = contents.decode(self.encoding)
            with io.open(fpath, 'w', encoding=self.encoding,
                         newline=newline) as fhandle:
                fhandle.write(contents)
        # set the file path if not set
        if self.fpath is None and fpath:
            self.fpath = fpath

    def find(self, st, by='msgid', include_obsolete_entries=False,
             msgctxt=False):
        """
        Find the entry which msgid (or property identified by the ``by``
        argument) matches the string ``st``.

        Keyword arguments:

        ``st``
            string, the string to search for.

        ``by``
            string, the property to use for comparison (default: ``msgid``).

        ``include_obsolete_entries``
            boolean, whether to also search in entries that are obsolete.

        ``msgctxt``
            string, allows specifying a specific message context for the
            search.

        Returns the matching entry or ``None``.
        """
        matches = [
            e for e in self
            if (include_obsolete_entries or not e.obsolete)
            and getattr(e, by) == st
            # ``False`` (the default) means "do not filter on the context"
            and (msgctxt is False or e.msgctxt == msgctxt)
        ]
        if not matches:
            return None
        if len(matches) > 1 and not msgctxt:
            # several candidates: prefer the (last) entry with no msgctxt
            without_context = [m for m in matches if not m.msgctxt]
            if without_context:
                return without_context[-1]
        # single match, or fallback to the first entry found
        return matches[0]

    def ordered_metadata(self):
        """
        Convenience method that returns an ordered version of the metadata
        dictionary. The return value is list of tuples (metadata name,
        metadata_value).
        """
        # copy the dict first
        metadata = self.metadata.copy()
        data_order = [
            'Project-Id-Version',
            'Report-Msgid-Bugs-To',
            'POT-Creation-Date',
            'PO-Revision-Date',
            'Last-Translator',
            'Language-Team',
            'Language',
            'MIME-Version',
            'Content-Type',
            'Content-Transfer-Encoding',
            'Plural-Forms'
        ]
        ordered_data = []
        for name in data_order:
            if name in metadata:
                ordered_data.append((name, metadata.pop(name)))
        # the rest of the metadata will be alphabetically ordered since there
        # are no specs for this AFAIK
        for name in natural_sort(metadata.keys()):
            ordered_data.append((name, metadata[name]))
        return ordered_data

    def to_binary(self):
        """
        Return the binary representation of the file.
        """
        # the keys are sorted in the .mo file; sorted() builds a new list so
        # the catalog itself keeps its order
        entries = sorted(
            self.translated_entries(),
            key=lambda o: o.msgid_with_context.encode('utf-8')
        )
        # add metadata entry
        entries.insert(0, self.metadata_as_entry())
        entries_len = len(entries)
        offsets = []
        ids, strs = b(''), b('')
        for e in entries:
            # For each string, we need size and file offset.  Each string is
            # NUL terminated; the NUL does not count into the size.
            msgid = b('')
            if e.msgctxt:
                # Contexts are stored by storing the concatenation of the
                # context, a <EOT> byte, and the original string
                msgid = self._encode(e.msgctxt + '\4')
            if e.msgid_plural:
                plurals = [
                    e.msgstr_plural[index]
                    for index in sorted(e.msgstr_plural.keys())
                ]
                msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
                msgstr = self._encode('\0'.join(plurals))
            else:
                msgid += self._encode(e.msgid)
                msgstr = self._encode(e.msgstr)
            offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
            ids += msgid + b('\0')
            strs += msgstr + b('\0')

        # The header is 7 32-bit unsigned integers.
        keystart = 7 * 4 + 16 * entries_len
        # and the values start after the keys
        valuestart = keystart + len(ids)
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets += [l1, o1 + keystart]
            voffsets += [l2, o2 + valuestart]
        offsets = koffsets + voffsets

        output = struct.pack(
            "Iiiiiii",
            # Magic number
            MOFile.MAGIC,
            # Version
            0,
            # number of entries
            entries_len,
            # start of key index
            7 * 4,
            # start of value index
            7 * 4 + entries_len * 8,
            # size and offset of hash table, we don't use hash tables
            0, keystart
        )
        packed_offsets = array.array("i", offsets)
        if hasattr(packed_offsets, 'tobytes'):  # python 3.2 or superior
            output += packed_offsets.tobytes()
        else:
            output += packed_offsets.tostring()
        output += ids
        output += strs
        return output

    def _encode(self, mixed):
        """
        Encodes the given ``mixed`` argument with the file encoding if and
        only if it's an unicode string and returns the encoded string.
        """
        if isinstance(mixed, text_type):
            mixed = mixed.encode(self.encoding)
        return mixed
631 # }}}
632 # class POFile {{{
class POFile(_BaseFile):
    """
    Po (or Pot) file reader/writer.
    This class inherits the :class:`~polib._BaseFile` class and, by extension,
    the python ``list`` type.
    """

    def __unicode__(self):
        """
        Returns the unicode representation of the po file.
        """
        header_lines = []
        for header in self.header.split('\n'):
            if not header:
                header_lines.append("#\n")
            elif header[:1] in (',', ':'):
                # no space after the "#" when the line starts with , or :
                header_lines.append('#%s\n' % header)
            else:
                header_lines.append('# %s\n' % header)
        ret = ''.join(header_lines)

        if not isinstance(ret, text_type):
            ret = ret.decode(self.encoding)

        return ret + _BaseFile.__unicode__(self)

    def save_as_mofile(self, fpath):
        """
        Saves the binary representation of the file to given ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the mo file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    def percent_translated(self):
        """
        Convenience method that returns the percentage of translated
        messages.

        Obsolete entries are not counted; a file without any countable entry
        is reported as ``100``.
        """
        total = len([e for e in self if not e.obsolete])
        if total == 0:
            return 100
        translated = len(self.translated_entries())
        return int(translated * 100 / float(total))

    def translated_entries(self):
        """
        Convenience method that returns the list of translated entries.
        """
        return [e for e in self if e.translated()]

    def untranslated_entries(self):
        """
        Convenience method that returns the list of untranslated entries.
        """
        return [e for e in self if not e.translated() and not e.obsolete
                and not e.fuzzy]

    def fuzzy_entries(self):
        """
        Convenience method that returns the list of fuzzy entries.
        """
        return [e for e in self if e.fuzzy and not e.obsolete]

    def obsolete_entries(self):
        """
        Convenience method that returns the list of obsolete entries.
        """
        return [e for e in self if e.obsolete]

    def merge(self, refpot):
        """
        Convenience method that merges the current pofile with the pot file
        provided. It behaves exactly as the gettext msgmerge utility:

        * comments of this file will be preserved, but extracted comments and
          occurrences will be discarded;
        * any translations or comments in the reference file will be
          discarded, however, dot comments and file positions will be
          preserved;
        * the fuzzy flags are preserved.

        Entries of this file that are absent from ``refpot`` are marked
        obsolete; entries only present in ``refpot`` are added.

        Keyword argument:

        ``refpot``
            object POFile, the reference catalog.
        """
        # Store entries in dict/set for faster access
        self_entries = dict(
            (entry.msgid_with_context, entry) for entry in self
        )
        refpot_msgids = set(entry.msgid_with_context for entry in refpot)
        # Merge entries that are in the refpot
        for entry in refpot:
            e = self_entries.get(entry.msgid_with_context)
            if e is None:
                e = POEntry()
                self.append(e)
            e.merge(entry)
        # ok, now we must "obsolete" entries that are not in the refpot anymore
        for entry in self:
            if entry.msgid_with_context not in refpot_msgids:
                entry.obsolete = True
739 # }}}
740 # class MOFile {{{
class MOFile(_BaseFile):
    """
    Mo file reader/writer.
    This class inherits the :class:`~polib._BaseFile` class and, by
    extension, the python ``list`` type.
    """
    # magic number written at the start of the binary output, and the same
    # four bytes in reversed order
    MAGIC = 0x950412de
    MAGIC_SWAPPED = 0xde120495

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts all keywords arguments accepted by
        :class:`~polib._BaseFile` class.
        """
        _BaseFile.__init__(self, *args, **kwargs)
        self.magic_number = None
        self.version = 0

    def save_as_pofile(self, fpath):
        """
        Saves the mofile as a pofile to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath)

    def save(self, fpath=None):
        """
        Saves the mofile to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    def percent_translated(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns ``100``.
        """
        return 100

    def translated_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Returns the file itself, not a copy.
        """
        return self

    def untranslated_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []

    def fuzzy_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []

    def obsolete_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []
812 # }}}
813 # class _BaseEntry {{{
class _BaseEntry(object):
    """
    Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
    This class should **not** be instantiated directly.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``msgid``
            string, the entry msgid.

        ``msgstr``
            string, the entry msgstr.

        ``msgid_plural``
            string, the entry msgid_plural.

        ``msgstr_plural``
            dict, the entry msgstr_plural lines.

        ``msgctxt``
            string, the entry context (msgctxt).

        ``obsolete``
            bool, whether the entry is "obsolete" or not.

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).
        """
        self.msgid = kwargs.get('msgid', '')
        self.msgstr = kwargs.get('msgstr', '')
        self.msgid_plural = kwargs.get('msgid_plural', '')
        self.msgstr_plural = kwargs.get('msgstr_plural', {})
        self.msgctxt = kwargs.get('msgctxt', None)
        self.obsolete = kwargs.get('obsolete', False)
        self.encoding = kwargs.get('encoding', default_encoding)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry.
        """
        # every line of an obsolete entry carries the "#~ " marker
        delflag = '#~ ' if self.obsolete else ''
        ret = []
        # write the msgctxt if any
        if self.msgctxt is not None:
            ret += self._str_field("msgctxt", delflag, "", self.msgctxt,
                                   wrapwidth)
        # write the msgid
        ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
        # write the msgid_plural if any
        if self.msgid_plural:
            ret += self._str_field("msgid_plural", delflag, "",
                                   self.msgid_plural, wrapwidth)
        if self.msgstr_plural:
            # write the msgstr_plural if any, in index order
            msgstrs = self.msgstr_plural
            for index in sorted(msgstrs):
                plural_index = '[%s]' % index
                ret += self._str_field("msgstr", delflag, plural_index,
                                       msgstrs[index], wrapwidth)
        else:
            # otherwise write the msgstr
            ret += self._str_field("msgstr", delflag, "", self.msgstr,
                                   wrapwidth)
        # trailing empty item so that the joined text ends with a newline
        ret.append('')
        return u('\n').join(ret)

    if PY3:
        def __str__(self):
            """
            Returns the string representation of the entry.
            """
            return self.__unicode__()
    else:
        def __str__(self):
            """
            Returns the string representation of the entry.
            """
            return unicode(self).encode(self.encoding)

    def __eq__(self, other):
        """
        Two entries are equal when their string representations are equal.
        """
        return str(self) == str(other)

    def _str_field(self, fieldname, delflag, plural_index, field,
                   wrapwidth=78):
        """
        Returns the list of output lines for one field of the entry.

        ``fieldname`` is the keyword to write (a ``previous_`` prefix is
        dropped), ``delflag`` is prepended to every line, ``plural_index``
        is appended to the keyword (e.g. ``[0]``) and ``field`` is the raw
        value. Values longer than ``wrapwidth`` are wrapped when
        ``wrapwidth`` is greater than zero.
        """
        lines = field.splitlines(True)
        if len(lines) > 1:
            lines = [''] + lines  # start with initial empty line
        else:
            escaped_field = escape(field)
            specialchars_count = sum(
                field.count(c) for c in ['\\', '\n', '\r', '\t', '"']
            )
            # comparison must take into account fieldname length + one space
            # + 2 quotes (eg. msgid "<string>")
            flength = len(fieldname) + 3
            if plural_index:
                flength += len(plural_index)
            real_wrapwidth = wrapwidth - flength + specialchars_count
            if wrapwidth > 0 and len(field) > real_wrapwidth:
                # Wrap the line but take field name into account
                wrapped = textwrap.wrap(
                    escaped_field,
                    wrapwidth - 2,  # 2 for quotes ""
                    drop_whitespace=False,
                    break_long_words=False
                )
                lines = [''] + [unescape(item) for item in wrapped]
            else:
                lines = [field]
        if fieldname.startswith('previous_'):
            # quick and dirty trick to get the real field name
            fieldname = fieldname[9:]

        ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index,
                                escape(lines.pop(0)))]
        for line in lines:
            ret.append('%s"%s"' % (delflag, escape(line)))
        return ret

    @property
    def msgid_with_context(self):
        """
        The msgid, prefixed with the msgctxt and a ``\\x04`` separator when
        the entry has a context.
        """
        if self.msgctxt:
            return '%s%s%s' % (self.msgctxt, "\x04", self.msgid)
        return self.msgid
947 # }}}
948 # class POEntry {{{
class POEntry(_BaseEntry):
    """
    Represents a po file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``comment``
            string, the entry comment.

        ``tcomment``
            string, the entry translator comment.

        ``occurrences``
            list, the entry occurrences.

        ``flags``
            list, the entry flags.

        ``previous_msgctxt``
            string, the entry previous context.

        ``previous_msgid``
            string, the entry previous msgid.

        ``previous_msgid_plural``
            string, the entry previous msgid_plural.

        ``linenum``
            integer, the line number of the entry
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        self.comment = kwargs.get('comment', '')
        self.tcomment = kwargs.get('tcomment', '')
        self.occurrences = kwargs.get('occurrences', [])
        self.flags = kwargs.get('flags', [])
        self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
        self.previous_msgid = kwargs.get('previous_msgid', None)
        self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
        self.linenum = kwargs.get('linenum', None)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry.
        """
        ret = []
        # comments first, if any (with text wrapping as xgettext does);
        # obsolete entries only keep their translator comment
        if self.obsolete:
            comments = [('tcomment', '# ')]
        else:
            comments = [('comment', '#. '), ('tcomment', '# ')]
        for attr, marker in comments:
            val = getattr(self, attr)
            if not val:
                continue
            for comment in val.split('\n'):
                if wrapwidth > 0 and len(comment) + len(marker) > wrapwidth:
                    ret += textwrap.wrap(
                        comment,
                        wrapwidth,
                        initial_indent=marker,
                        subsequent_indent=marker,
                        break_long_words=False
                    )
                else:
                    ret.append('%s%s' % (marker, comment))

        # occurrences (with text wrapping as xgettext does)
        if not self.obsolete and self.occurrences:
            filelist = []
            for fpath, lineno in self.occurrences:
                if lineno:
                    filelist.append('%s:%s' % (fpath, lineno))
                else:
                    filelist.append(fpath)
            filestr = ' '.join(filelist)
            if wrapwidth > 0 and len(filestr) + 3 > wrapwidth:
                # textwrap splits words that contain hyphens by default,
                # which is not what we want for filenames; turning that off
                # leaves the paths themselves untouched
                ret += textwrap.wrap(
                    filestr,
                    wrapwidth,
                    initial_indent='#: ',
                    subsequent_indent='#: ',
                    break_long_words=False,
                    break_on_hyphens=False
                )
            else:
                ret.append('#: ' + filestr)

        # flags (TODO: wrapping ?)
        if self.flags:
            ret.append('#, %s' % ', '.join(self.flags))

        # previous context and previous msgid/msgid_plural
        fields = ['previous_msgctxt', 'previous_msgid',
                  'previous_msgid_plural']
        prefix = "#~| " if self.obsolete else "#| "
        for f in fields:
            val = getattr(self, f)
            if val is not None:
                ret += self._str_field(f, prefix, "", val, wrapwidth)

        ret.append(_BaseEntry.__unicode__(self, wrapwidth))
        return u('\n').join(ret)

    def __cmp__(self, other):
        """
        Called by comparison operations if rich comparison is not defined.

        Returns ``-1``, ``0`` or ``1``. Entries are ordered by obsolete
        state (obsolete first), then occurrences, msgctxt, msgid_plural,
        msgstr_plural values, msgid and finally msgstr.
        """
        # First: Obsolete test
        if self.obsolete != other.obsolete:
            return -1 if self.obsolete else 1

        def plural_values(entry):
            # only a non-empty dict contributes values to the comparison
            plurals = entry.msgstr_plural
            if plurals and isinstance(plurals, dict):
                return list(plurals.values())
            return []

        # building the pairs does not compare anything: the comparisons
        # below still happen one at a time, in this order
        criteria = (
            # sorted() works on copies, the originals are left untouched
            (sorted(self.occurrences[:]), sorted(other.occurrences[:])),
            (self.msgctxt or '0', other.msgctxt or '0'),
            (self.msgid_plural or '0', other.msgid_plural or '0'),
            (plural_values(self), plural_values(other)),
            (self.msgid, other.msgid),
            (self.msgstr, other.msgstr),
        )
        for mine, theirs in criteria:
            if mine > theirs:
                return 1
            if mine < theirs:
                return -1
        return 0

    def __gt__(self, other):
        return self.__cmp__(other) > 0

    def __lt__(self, other):
        return self.__cmp__(other) < 0

    def __ge__(self, other):
        return self.__cmp__(other) >= 0

    def __le__(self, other):
        return self.__cmp__(other) <= 0

    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0

    def translated(self):
        """
        Returns ``True`` if the entry has been translated or ``False``
        otherwise.

        Obsolete and fuzzy entries are never considered translated; a plural
        entry is translated only when none of its plural forms is empty.
        """
        if self.obsolete or self.fuzzy:
            return False
        if self.msgstr != '':
            return True
        if self.msgstr_plural:
            return not any(
                self.msgstr_plural[pos] == '' for pos in self.msgstr_plural
            )
        return False

    def merge(self, other):
        """
        Merge the current entry with the given pot entry.

        Everything but the translations and the translator comment is taken
        from ``other``; the fuzzy flag of the current entry is kept.
        """
        self.msgid = other.msgid
        self.msgctxt = other.msgctxt
        # copy, so that later edits of one entry do not leak into the other
        self.occurrences = list(other.occurrences)
        self.comment = other.comment
        fuzzy = self.fuzzy
        self.flags = other.flags[:]  # clone flags
        if fuzzy:
            self.flags.append('fuzzy')
        self.msgid_plural = other.msgid_plural
        self.obsolete = other.obsolete
        self.previous_msgctxt = other.previous_msgctxt
        self.previous_msgid = other.previous_msgid
        self.previous_msgid_plural = other.previous_msgid_plural
        if other.msgstr_plural:
            for pos in other.msgstr_plural:
                # keep existing translation at pos if any
                self.msgstr_plural.setdefault(pos, '')

    @property
    def fuzzy(self):
        """``True`` when the entry carries the ``fuzzy`` flag."""
        return 'fuzzy' in self.flags

    def __hash__(self):
        return hash((self.msgid, self.msgstr))
1184 # }}}
1185 # class MOEntry {{{
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments,
        for consistency with :class:`~polib.POEntry`:

         ``comment``
         ``tcomment``
         ``occurrences``
         ``flags``
         ``previous_msgctxt``
         ``previous_msgid``
         ``previous_msgid_plural``

        Note: even though these keyword arguments are accepted,
        they hold no real meaning in the context of MO files
        and are simply ignored.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # fixed values, whatever the caller passed for these keywords
        self.comment = ''
        self.tcomment = ''
        self.occurrences = []
        self.flags = []
        self.previous_msgctxt = None
        self.previous_msgid = None
        self.previous_msgid_plural = None

    def __hash__(self):
        return hash((self.msgid, self.msgstr))
1221 # }}}
1222 # class _POFileParser {{{
class _POFileParser(object):
    """
    A finite state machine to parse efficiently and correctly po
    file format.
    """

    # a double quote that is not escaped by a preceding backslash
    _UNESCAPED_QUOTE = re.compile(r'([^\\]|^)"')

    # states that mean the current entry already has its msgstr; any
    # comment or msgctxt/msgid seen in one of them starts a new entry
    _ENTRY_END_STATES = ('mc', 'ms', 'mx')

    # state -> entry attribute extended by a continuation line
    # ('mx' is handled apart because it targets a dict item)
    _CONTINUED_ATTRS = {
        'ct': 'msgctxt',
        'mi': 'msgid',
        'mp': 'msgid_plural',
        'ms': 'msgstr',
        'pp': 'previous_msgid_plural',
        'pm': 'previous_msgid',
        'pc': 'previous_msgctxt',
    }

    def __init__(self, pofile, *args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``pofile``
            string, path to the po file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class to instantiate instead of :class:`~polib.POFile`
            (optional).
        """
        enc = kwargs.get('encoding', default_encoding)
        if _is_file(pofile):
            try:
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
            except LookupError:
                # unknown encoding name: fall back to the default one
                enc = default_encoding
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
        else:
            # the content itself was given: iterate over its lines
            self.fhandle = pofile.splitlines()

        klass = kwargs.get('klass')
        if klass is None:
            klass = POFile
        self.instance = klass(
            pofile=pofile,
            encoding=enc,
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )
        self.transitions = {}
        self.current_line = 0
        self.current_entry = POEntry(linenum=self.current_line)
        self.current_state = 'st'
        self.current_token = None
        # two memo flags used in handlers
        self.msgstr_index = 0
        self.entry_obsolete = 0
        # Configure the state machine, by adding transitions.
        # Signification of symbols:
        #     * ST: Beginning of the file (start)
        #     * HE: Header
        #     * TC: a translation comment
        #     * GC: a generated comment
        #     * OC: a file/line occurrence
        #     * FL: a flags line
        #     * CT: a message context
        #     * PC: a previous msgctxt
        #     * PM: a previous msgid
        #     * PP: a previous msgid_plural
        #     * MI: a msgid
        #     * MP: a msgid plural
        #     * MS: a msgstr
        #     * MX: a msgstr plural
        #     * MC: a msgid or msgstr continuation line
        every_state = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp',
                       'tc', 'ms', 'mp', 'mx', 'mi']

        self.add('tc', ['st', 'he'], 'he')
        self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms',
                        'mp', 'mx', 'mi'], 'tc')
        self.add('gc', every_state, 'gc')
        self.add('oc', every_state, 'oc')
        self.add('fl', every_state, 'fl')
        self.add('pc', every_state, 'pc')
        self.add('pm', every_state, 'pm')
        self.add('pp', every_state, 'pp')
        self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm',
                        'pp', 'ms', 'mx'], 'ct')
        self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc',
                        'pm', 'pp', 'ms', 'mx'], 'mi')
        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
        self.add('ms', ['mi', 'mp', 'tc'], 'ms')
        self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')

    def parse(self):
        """
        Run the state machine, parse the file line by line and call process()
        with the current matched symbol.

        Returns the populated file instance. Raises ``IOError`` on a syntax
        error. The file opened by the constructor is closed in every case.
        """
        try:
            self._parse_lines()
            self._extract_metadata()
        finally:
            # close opened file (when the content was given instead of a
            # path, fhandle is a plain list of lines)
            if not isinstance(self.fhandle, list):
                self.fhandle.close()
        return self.instance

    def _syntax_error(self, detail=None):
        """Build the ``IOError`` describing a syntax error on this line."""
        fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
        message = 'Syntax error in po file %s(line %s)' % (
            fpath, self.current_line)
        if detail:
            message += ': ' + detail
        return IOError(message)

    def _parse_lines(self):
        """Feed every line to the state machine and add the last entry."""
        keywords = {
            'msgctxt': 'ct',
            'msgid': 'mi',
            'msgstr': 'ms',
            'msgid_plural': 'mp',
        }
        prev_keywords = {
            'msgid_plural': 'pp',
            'msgid': 'pm',
            'msgctxt': 'pc',
        }
        bom = codecs.BOM_UTF8.decode('utf-8')
        tokens = []
        for line in self.fhandle:
            self.current_line += 1
            if self.current_line == 1 and line.startswith(bom):
                line = line[len(bom):]
            line = line.strip()
            if line == '':
                continue

            tokens = line.split(None, 2)
            nb_tokens = len(tokens)

            if tokens[0] == '#~|':
                continue

            if tokens[0] == '#~' and nb_tokens > 1:
                # obsolete entry: drop the marker and parse what follows
                line = line[3:].strip()
                tokens = tokens[1:]
                nb_tokens -= 1
                self.entry_obsolete = 1
            else:
                self.entry_obsolete = 0

            # Take care of keywords like
            # msgid, msgid_plural, msgctxt & msgstr.
            if tokens[0] in keywords and nb_tokens > 1:
                line = line[len(tokens[0]):].lstrip()
                if self._UNESCAPED_QUOTE.search(line[1:-1]):
                    raise self._syntax_error('unescaped double quote found')
                self.current_token = line
                self.process(keywords[tokens[0]])
                continue

            self.current_token = line

            if tokens[0] == '#:':
                if nb_tokens <= 1:
                    continue
                # we are on a occurrences line
                self.process('oc')

            elif line[:1] == '"':
                # we are on a continuation line
                if self._UNESCAPED_QUOTE.search(line[1:-1]):
                    raise self._syntax_error('unescaped double quote found')
                self.process('mc')

            elif line[:7] == 'msgstr[':
                # we are on a msgstr plural
                self.process('mx')

            elif tokens[0] == '#,':
                if nb_tokens <= 1:
                    continue
                # we are on a flags line
                self.process('fl')

            elif tokens[0] == '#' or tokens[0].startswith('##'):
                # we are on a translator comment line
                self.process('tc')

            elif tokens[0] == '#.':
                if nb_tokens <= 1:
                    continue
                # we are on a generated comment line
                self.process('gc')

            elif tokens[0] == '#|':
                if nb_tokens <= 1:
                    raise self._syntax_error()

                # Remove the marker and any whitespace right after that.
                line = line[2:].lstrip()
                self.current_token = line

                if tokens[1].startswith('"'):
                    # Continuation of previous metadata.
                    self.process('mc')
                    continue

                if nb_tokens == 2:
                    # Invalid continuation line.
                    raise self._syntax_error('invalid continuation line')

                # we are on a "previous translation" comment line,
                if tokens[1] not in prev_keywords:
                    # Unknown keyword in previous translation comment.
                    raise self._syntax_error(
                        'unknown keyword %s' % tokens[1])

                # Remove the keyword and any whitespace
                # between it and the starting quote.
                line = line[len(tokens[1]):].lstrip()
                self.current_token = line
                self.process(prev_keywords[tokens[1]])

            else:
                raise self._syntax_error()

        if self.current_entry and len(tokens) > 0 and \
                not tokens[0].startswith('#'):
            # since entries are added when another entry is found, we must add
            # the last entry here (only if there are lines). Trailing comments
            # are ignored
            self.instance.append(self.current_entry)

    def _extract_metadata(self):
        """Move the entry with an empty msgid into the metadata dict."""
        metadataentry = self.instance.find('')
        if not metadataentry:
            return
        # remove the entry
        self.instance.remove(metadataentry)
        self.instance.metadata_is_fuzzy = metadataentry.flags
        key = None
        for msg in metadataentry.msgstr.splitlines():
            try:
                key, val = msg.split(':', 1)
                self.instance.metadata[key] = val.strip()
            except (ValueError, KeyError):
                # no colon on this line: it continues the previous value
                if key is not None:
                    self.instance.metadata[key] += '\n' + msg.strip()

    def add(self, symbol, states, next_state):
        """
        Add a transition to the state machine.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``states``
            list, a list of states (two chars symbols).

        ``next_state``
            the next state the fsm will have after the action.
        """
        action = getattr(self, 'handle_%s' % next_state)
        for state in states:
            self.transitions[(symbol, state)] = (action, next_state)

    def process(self, symbol):
        """
        Process the transition corresponding to the current state and the
        symbol provided.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        Raises ``IOError`` when no transition exists for the symbol in the
        current state or when the handler fails.
        """
        try:
            (action, state) = self.transitions[(symbol, self.current_state)]
            if action():
                self.current_state = state
        except Exception:
            if hasattr(self.fhandle, 'close'):
                self.fhandle.close()
            raise self._syntax_error()

    # state handlers

    def _flush_entry(self):
        """Store the current entry and start a new one if it is complete."""
        if self.current_state in self._ENTRY_END_STATES:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)

    @staticmethod
    def _plural_index(token):
        """Return the integer N of a ``msgstr[N] "..."`` line."""
        match = re.match(r'msgstr\[(\d+)', token)
        if match is None:
            raise ValueError('invalid plural index in %r' % token)
        return int(match.group(1))

    def handle_he(self):
        """Handle a header comment."""
        if self.instance.header != '':
            self.instance.header += '\n'
        self.instance.header += self.current_token[2:]
        return True

    def handle_tc(self):
        """Handle a translator comment."""
        self._flush_entry()
        if self.current_entry.tcomment != '':
            self.current_entry.tcomment += '\n'
        tcomment = self.current_token.lstrip('#')
        if tcomment.startswith(' '):
            tcomment = tcomment[1:]
        self.current_entry.tcomment += tcomment
        return True

    def handle_gc(self):
        """Handle a generated comment."""
        self._flush_entry()
        if self.current_entry.comment != '':
            self.current_entry.comment += '\n'
        self.current_entry.comment += self.current_token[3:]
        return True

    def handle_oc(self):
        """Handle a file:num occurrence."""
        self._flush_entry()
        for occurrence in self.current_token[3:].split():
            fil, sep, line = occurrence.rpartition(':')
            if not sep or not line.isdigit():
                # no line number: the whole token is the file name
                fil, line = occurrence, ''
            self.current_entry.occurrences.append((fil, line))
        return True

    def handle_fl(self):
        """Handle a flags line."""
        self._flush_entry()
        self.current_entry.flags += [c.strip() for c in
                                     self.current_token[3:].split(',')]
        return True

    def handle_pp(self):
        """Handle a previous msgid_plural line."""
        self._flush_entry()
        self.current_entry.previous_msgid_plural = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pm(self):
        """Handle a previous msgid line."""
        self._flush_entry()
        self.current_entry.previous_msgid = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pc(self):
        """Handle a previous msgctxt line."""
        self._flush_entry()
        self.current_entry.previous_msgctxt = \
            unescape(self.current_token[1:-1])
        return True

    def handle_ct(self):
        """Handle a msgctxt."""
        self._flush_entry()
        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_mi(self):
        """Handle a msgid."""
        self._flush_entry()
        self.current_entry.obsolete = self.entry_obsolete
        self.current_entry.msgid = unescape(self.current_token[1:-1])
        return True

    def handle_mp(self):
        """Handle a msgid plural."""
        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_ms(self):
        """Handle a msgstr."""
        self.current_entry.msgstr = unescape(self.current_token[1:-1])
        return True

    def handle_mx(self):
        """Handle a msgstr plural."""
        token = self.current_token
        # read all the digits between the brackets, not only the first one
        index = self._plural_index(token)
        value = token[token.find('"') + 1:-1]
        self.current_entry.msgstr_plural[index] = unescape(value)
        self.msgstr_index = index
        return True

    def handle_mc(self):
        """Handle a msgid or msgstr continuation line."""
        token = unescape(self.current_token[1:-1])
        entry = self.current_entry
        state = self.current_state
        if state == 'mx':
            entry.msgstr_plural[self.msgstr_index] += token
        elif state in self._CONTINUED_ATTRS:
            attr = self._CONTINUED_ATTRS[state]
            setattr(entry, attr, getattr(entry, attr) + token)
        # don't change the current state
        return False
1655 # }}}
1656 # class _MOFileParser {{{
class _MOFileParser(object):
    """
    A class to parse binary mo files.
    """

    def __init__(self, mofile, *args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``mofile``
            string, path to the mo file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class to instantiate instead of :class:`~polib.MOFile`
            (optional).
        """
        # set first so that __del__ is safe even if opening the file fails
        self.fhandle = None
        if _is_file(mofile):
            self.fhandle = open(mofile, 'rb')
        else:
            self.fhandle = io.BytesIO(mofile)

        klass = kwargs.get('klass')
        if klass is None:
            klass = MOFile
        self.instance = klass(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )

    def __del__(self):
        """
        Make sure the file is closed, this prevents warnings on unclosed file
        when running tests with python >= 3.2.
        """
        # the attribute is missing when the constructor never ran or failed
        fhandle = getattr(self, 'fhandle', None)
        if fhandle and hasattr(fhandle, 'close'):
            fhandle.close()

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Returns the populated file instance. Raises ``IOError`` when the
        magic number or the major revision number is not a supported one.
        The file handle is closed in every case.
        """
        try:
            # parse magic number
            magic_number = self._readbinary('<I', 4)
            if magic_number == MOFile.MAGIC:
                ii = '<II'
            elif magic_number == MOFile.MAGIC_SWAPPED:
                ii = '>II'
            else:
                raise IOError('Invalid mo file, magic number is incorrect !')
            self.instance.magic_number = magic_number
            # parse the version number and the number of strings
            version, numofstrings = self._readbinary(ii, 8)
            # from MO file format specs: "A program seeing an unexpected major
            # revision number should stop reading the MO file entirely"
            if version >> 16 not in (0, 1):
                raise IOError(
                    'Invalid mo file, unexpected major revision number')
            self.instance.version = version
            # offsets of the two tables describing the original strings
            # and the translated strings
            msgids_table_offset, msgstrs_table_offset = \
                self._readbinary(ii, 8)
            # each table entry is a (length, offset) pair
            self.fhandle.seek(msgids_table_offset)
            msgids_index = [self._readbinary(ii, 8)
                            for _ in range(numofstrings)]
            self.fhandle.seek(msgstrs_table_offset)
            msgstrs_index = [self._readbinary(ii, 8)
                             for _ in range(numofstrings)]
            # build entries
            encoding = self.instance.encoding
            for i in range(numofstrings):
                msgid_length, msgid_offset = msgids_index[i]
                self.fhandle.seek(msgid_offset)
                msgid = self.fhandle.read(msgid_length)

                msgstr_length, msgstr_offset = msgstrs_index[i]
                self.fhandle.seek(msgstr_offset)
                msgstr = self.fhandle.read(msgstr_length)
                if i == 0 and not msgid:  # metadata
                    raw_metadata, metadata = msgstr.split(b('\n')), {}
                    for line in raw_metadata:
                        tokens = line.split(b(':'), 1)
                        if tokens[0] != b(''):
                            try:
                                k = tokens[0].decode(encoding)
                                v = tokens[1].decode(encoding)
                                metadata[k] = v.strip()
                            except IndexError:
                                # line without a colon: key with no value
                                metadata[k] = u('')
                    self.instance.metadata = metadata
                    continue
                # test if we have a plural entry
                msgid_tokens = msgid.split(b('\0'))
                if len(msgid_tokens) > 1:
                    entry = self._build_entry(
                        msgid=msgid_tokens[0],
                        msgid_plural=msgid_tokens[1],
                        msgstr_plural=dict(enumerate(msgstr.split(b('\0'))))
                    )
                else:
                    entry = self._build_entry(msgid=msgid, msgstr=msgstr)
                self.instance.append(entry)
        finally:
            # close opened file, also when the file turns out to be invalid
            self.fhandle.close()
        return self.instance

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
                     msgstr_plural=None):
        """
        Build a :class:`~polib.MOEntry` from the raw (bytes) values read
        from the file, decoding them with the encoding of the instance.
        """
        encoding = self.instance.encoding
        # the context, if any, ends at the first EOT byte; anything after
        # it belongs to the msgid
        msgctxt_msgid = msgid.split(b('\x04'), 1)
        if len(msgctxt_msgid) > 1:
            kwargs = {
                'msgctxt': msgctxt_msgid[0].decode(encoding),
                'msgid': msgctxt_msgid[1].decode(encoding),
            }
        else:
            kwargs = {'msgid': msgid.decode(encoding)}
        if msgstr:
            kwargs['msgstr'] = msgstr.decode(encoding)
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
        if msgstr_plural:
            kwargs['msgstr_plural'] = dict(
                (k, v.decode(encoding)) for k, v in msgstr_plural.items())
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpack n bytes of data using format <fmt>.
        It returns a tuple or a mixed value if the tuple length is 1.
        """
        data = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, data)
        if len(tup) == 1:
            return tup[0]
        return tup
1804 # }}}