cola/polib.py

   1 #
   2 # License: MIT (see extras/polib/LICENSE file provided)
   3 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
   4 # pylint: disable=consider-using-with,no-else-return
   5
   6 """
   7 **polib** allows you to manipulate, create, modify gettext files (pot, po and
mo files).  You can load existing files, iterate through their entries, add,
   9 modify entries, comments or metadata, etc. or create new po files from scratch.
  10
  11 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
  12 :func:`~polib.mofile` convenience functions.
  13 """
  14 import array
  15 import codecs
  16 import os
  17 import re
  18 import struct
  19 import sys
  20 import textwrap
  21 import io
  22
  23 from . import compat
  24
  25
__author__ = 'David Jean Louis <izimobil@gmail.com>'
__version__ = '1.1.1'
# names exported by a star-import of this module
__all__ = [
    'pofile',
    'POFile',
    'POEntry',
    'mofile',
    'MOFile',
    'MOEntry',
    'default_encoding',
    'escape',
    'unescape',
    'detect_encoding',
]
  40
  41
# the default encoding to use when encoding cannot be detected; it is also
# the fallback for the ``encoding`` keyword of the file and entry classes
default_encoding = 'utf-8'
  44
  45 # python 2/3 compatibility helpers {{{
  46
  47
# Hard-coded for Python 3; kept because code below still branches on ``PY3``
# and checks values against ``text_type``.
PY3 = True
text_type = str
  50
  51
def b(s):
    """Return the text string ``s`` encoded as UTF-8 bytes."""
    return s.encode('utf-8')
  54
  55
def u(s):
    """Return ``s`` unchanged (identity helper of the compatibility layer)."""
    return s
  58
  59
  60 # }}}
  61 # _pofile_or_mofile {{{
  62
  63
  64 def _pofile_or_mofile(f, filetype, **kwargs):
  65     """
  66     Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
  67     honor the DRY concept.
  68     """
  69     # get the file encoding
  70     enc = kwargs.get('encoding')
  71     if enc is None:
  72         enc = detect_encoding(f, filetype == 'mofile')
  73
  74     # parse the file
  75     kls = _POFileParser if filetype == 'pofile' else _MOFileParser
  76     parser = kls(
  77         f,
  78         encoding=enc,
  79         check_for_duplicates=kwargs.get('check_for_duplicates', False),
  80         klass=kwargs.get('klass'),
  81     )
  82     instance = parser.parse()
  83     instance.wrapwidth = kwargs.get('wrapwidth', 78)
  84     return instance
  85
  86
  87 # }}}
  88 # _is_file {{{
  89
  90
  91 def _is_file(filename_or_contents):
  92     """
  93     Safely returns the value of os.path.exists(filename_or_contents).
  94
  95     Arguments:
  96
  97     ``filename_or_contents``
  98         either a filename, or a string holding the contents of some file.
  99         In the latter case, this function will always return False.
 100     """
 101     try:
 102         return os.path.isfile(filename_or_contents)
 103     except (TypeError, ValueError, UnicodeEncodeError):
 104         return False
 105
 106
 107 # }}}
 108 # function pofile() {{{
 109
 110
 111 # pylint: disable=redefined-outer-name
def pofile(pofile, **kwargs):
    """
    Convenience function that parses the po or pot file ``pofile`` and returns
    a :class:`~polib.POFile` instance.

    Arguments:

    ``pofile``
        string, full or relative path to the po/pot file or its content (data).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.POFile`
        instance).
    """
    # all the work is delegated to the helper shared with mofile()
    return _pofile_or_mofile(pofile, 'pofile', **kwargs)
 140
 141
 142 # }}}
 143 # function mofile() {{{
 144
 145
 146 # pylint: disable=redefined-outer-name
def mofile(mofile, **kwargs):
    """
    Convenience function that parses the mo file ``mofile`` and returns a
    :class:`~polib.MOFile` instance.

    Arguments:

    ``mofile``
        string, full or relative path to the mo file or its content (string
        or bytes).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext to generate the po file that was used to format the mo file
        (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.MOFile`
        instance).
    """
    # all the work is delegated to the helper shared with pofile()
    return _pofile_or_mofile(mofile, 'mofile', **kwargs)
 177
 178
 179 # }}}
 180 # function detect_encoding() {{{
 181
 182
 183 def detect_encoding(file, binary_mode=False):
 184     """
 185     Try to detect the encoding used by the ``file``. The ``file`` argument can
 186     be a PO or MO file path or a string containing the contents of the file.
 187     If the encoding cannot be detected, the function will return the value of
 188     ``default_encoding``.
 189
 190     Arguments:
 191
 192     ``file``
 193         string, full or relative path to the po/mo file or its content.
 194
 195     ``binary_mode``
 196         boolean, set this to True if ``file`` is a mo file.
 197     """
 198     PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
 199     rxt = re.compile(u(PATTERN))
 200     rxb = re.compile(b(PATTERN))
 201
 202     def charset_exists(charset):
 203         """Check whether ``charset`` is valid or not."""
 204         try:
 205             codecs.lookup(charset)
 206         except LookupError:
 207             return False
 208         return True
 209
 210     if not _is_file(file):
 211         try:
 212             match = rxt.search(file)
 213         except TypeError:
 214             match = rxb.search(file)
 215         if match:
 216             enc = match.group(1).strip()
 217             if not isinstance(enc, text_type):
 218                 enc = enc.decode('utf-8')
 219             if charset_exists(enc):
 220                 return enc
 221     else:
 222         # For PY3, always treat as binary
 223         if binary_mode or PY3:
 224             mode = 'rb'
 225             rx = rxb
 226         else:
 227             mode = 'r'
 228             rx = rxt
 229         f = open(file, mode)
 230         for line in f.readlines():
 231             match = rx.search(line)
 232             if match:
 233                 f.close()
 234                 enc = match.group(1).strip()
 235                 if not isinstance(enc, text_type):
 236                     enc = enc.decode('utf-8')
 237                 if charset_exists(enc):
 238                     return enc
 239         f.close()
 240     return default_encoding
 241
 242
 243 # }}}
 244 # function escape() {{{
 245
 246
 247 def escape(st):
 248     """
 249     Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 250     the given string ``st`` and returns it.
 251     """
 252     return (
 253         st.replace('\\', r'\\')
 254         .replace('\t', r'\t')
 255         .replace('\r', r'\r')
 256         .replace('\n', r'\n')
 257         .replace('"', r'\"')
 258     )
 259
 260
 261 # }}}
 262 # function unescape() {{{
 263
 264
 265 def unescape(st):
 266     """
 267     Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 268     the given string ``st`` and returns it.
 269     """
 270
 271     def unescape_repl(m):
 272         m = m.group(1)
 273         if m == 'n':
 274             return '\n'
 275         if m == 't':
 276             return '\t'
 277         if m == 'r':
 278             return '\r'
 279         if m == '\\':
 280             return '\\'
 281         return m  # handles escaped double quote
 282
 283     return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
 284
 285
 286 # }}}
 287 # function natural_sort() {{{
 288
 289
 290 def natural_sort(lst):
 291     """
 292     Sort naturally the given list.
 293     Credits: http://stackoverflow.com/a/4836734
 294     """
 295
 296     def convert(text):
 297         return int(text) if text.isdigit() else text.lower()
 298
 299     def alphanum_key(key):
 300         return [convert(c) for c in re.split('([0-9]+)', key)]
 301
 302     return sorted(lst, key=alphanum_key)
 303
 304
 305 # }}}
 306 # class _BaseFile {{{
 307
 308
 309 class _BaseFile(list):
 310     """
 311     Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
 312     classes. This class should **not** be instantiated directly.
 313     """
 314
 315     def __init__(self, *_args, **kwargs):
 316         """
 317         Constructor, accepts the following keyword arguments:
 318
 319         ``pofile``
 320             string, the path to the po or mo file, or its content as a string.
 321
 322         ``wrapwidth``
 323             integer, the wrap width, only useful when the ``-w`` option was
 324             passed to xgettext (optional, default: ``78``).
 325
 326         ``encoding``
 327             string, the encoding to use, defaults to ``default_encoding``
 328             global variable (optional).
 329
 330         ``check_for_duplicates``
 331             whether to check for duplicate entries when adding entries to the
 332             file, (optional, default: ``False``).
 333         """
 334         list.__init__(self)
 335         # the opened file handle
 336         pofile = kwargs.get('pofile', None)  # pylint: disable=redefined-outer-name
 337         if pofile and _is_file(pofile):
 338             self.fpath = pofile
 339         else:
 340             self.fpath = kwargs.get('fpath')
 341         # the width at which lines should be wrapped
 342         self.wrapwidth = kwargs.get('wrapwidth', 78)
 343         # the file encoding
 344         self.encoding = kwargs.get('encoding', default_encoding)
 345         # whether to check for duplicate entries or not
 346         self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
 347         # header
 348         self.header = ''
 349         # both po and mo files have metadata
 350         self.metadata = {}
 351         self.metadata_is_fuzzy = 0
 352
 353     def __unicode__(self):
 354         """
 355         Returns the unicode representation of the file.
 356         """
 357         ret = []
 358         entries = [self.metadata_as_entry()] + [e for e in self if not e.obsolete]
 359         for entry in entries:
 360             ret.append(entry.__unicode__(self.wrapwidth))
 361         for entry in self.obsolete_entries():  # pylint: disable=no-member
 362             ret.append(entry.__unicode__(self.wrapwidth))
 363         ret = u('\n').join(ret)
 364         return ret
 365
 366     if PY3:
 367
 368         def __str__(self):
 369             return self.__unicode__()
 370
 371     else:
 372
 373         def __str__(self):
 374             """
 375             Returns the string representation of the file.
 376             """
 377             return compat.ustr(self).encode(self.encoding)
 378
 379     def __contains__(self, entry):
 380         """
 381         Overridden ``list`` method to implement the membership test (in and
 382         not in).
 383         The method considers that an entry is in the file if it finds an entry
 384         that has the same msgid (the test is **case sensitive**) and the same
 385         msgctxt (or none for both entries).
 386
 387         Argument:
 388
 389         ``entry``
 390             an instance of :class:`~polib._BaseEntry`.
 391         """
 392         return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) is not None
 393
    def __eq__(self, other):
        """Compare the string representations of ``self`` and ``other``."""
        return str(self) == str(other)
 396
    def __hash__(self):
        """Hash the string representation, matching what ``__eq__`` compares."""
        return hash(str(self))
 399
 400     def append(self, entry):
 401         """
 402         Overridden method to check for duplicates entries, if a user tries to
 403         add an entry that is already in the file, the method will raise a
 404         ``ValueError`` exception.
 405
 406         Argument:
 407
 408         ``entry``
 409             an instance of :class:`~polib._BaseEntry`.
 410         """
 411         # check_for_duplicates may not be defined (yet) when unpickling.
 412         # But if pickling, we never want to check for duplicates anyway.
 413         if getattr(self, 'check_for_duplicates', False) and entry in self:
 414             raise ValueError('Entry "%s" already exists' % entry.msgid)
 415         super().append(entry)
 416
 417     def insert(self, index, entry):
 418         """
 419         Overridden method to check for duplicates entries, if a user tries to
 420         add an entry that is already in the file, the method will raise a
 421         ``ValueError`` exception.
 422
 423         Arguments:
 424
 425         ``index``
 426             index at which the entry should be inserted.
 427
 428         ``entry``
 429             an instance of :class:`~polib._BaseEntry`.
 430         """
 431         if self.check_for_duplicates and entry in self:
 432             raise ValueError('Entry "%s" already exists' % entry.msgid)
 433         super().insert(index, entry)
 434
 435     def metadata_as_entry(self):
 436         """
 437         Returns the file metadata as a :class:`~polib.POFile` instance.
 438         """
 439         e = POEntry(msgid='')
 440         mdata = self.ordered_metadata()
 441         if mdata:
 442             strs = []
 443             for name, value in mdata:
 444                 # Strip whitespace off each line in a multi-line entry
 445                 strs.append(f'{name}: {value}')
 446             e.msgstr = '\n'.join(strs) + '\n'
 447         if self.metadata_is_fuzzy:
 448             e.flags.append('fuzzy')
 449         return e
 450
 451     def save(self, fpath=None, repr_method='__unicode__', newline=None):
 452         """
 453         Saves the po file to ``fpath``.
 454         If it is an existing file and no ``fpath`` is provided, then the
 455         existing file is rewritten with the modified data.
 456
 457         Keyword arguments:
 458
 459         ``fpath``
 460             string, full or relative path to the file.
 461
 462         ``repr_method``
 463             string, the method to use for output.
 464
 465         ``newline``
 466             string, controls how universal newlines works
 467         """
 468         if self.fpath is None and fpath is None:
 469             raise OSError('You must provide a file path to save() method')
 470         contents = getattr(self, repr_method)()
 471         if fpath is None:
 472             fpath = self.fpath
 473         if repr_method == 'to_binary':
 474             fhandle = open(fpath, 'wb')
 475         else:
 476             fhandle = open(fpath, 'w', encoding=self.encoding, newline=newline)
 477             if not isinstance(contents, text_type):
 478                 contents = contents.decode(self.encoding)
 479         fhandle.write(contents)
 480         fhandle.close()
 481         # set the file path if not set
 482         if self.fpath is None and fpath:
 483             self.fpath = fpath
 484
 485     def find(self, st, by='msgid', include_obsolete_entries=False, msgctxt=False):
 486         """
 487         Find the entry which msgid (or property identified by the ``by``
 488         argument) matches the string ``st``.
 489
 490         Keyword arguments:
 491
 492         ``st``
 493             string, the string to search for.
 494
 495         ``by``
 496             string, the property to use for comparison (default: ``msgid``).
 497
 498         ``include_obsolete_entries``
 499             boolean, whether to also search in entries that are obsolete.
 500
 501         ``msgctxt``
 502             string, allows specifying a specific message context for the
 503             search.
 504         """
 505         if include_obsolete_entries:
 506             entries = self[:]
 507         else:
 508             entries = [e for e in self if not e.obsolete]
 509         matches = []
 510         for e in entries:
 511             if getattr(e, by) == st:
 512                 if msgctxt is not False and e.msgctxt != msgctxt:
 513                     continue
 514                 matches.append(e)
 515         if len(matches) == 1:
 516             return matches[0]
 517         elif len(matches) > 1:
 518             if not msgctxt:
 519                 # find the entry with no msgctx
 520                 e = None
 521                 for m in matches:
 522                     if not m.msgctxt:
 523                         e = m
 524                 if e:
 525                     return e
 526                 # fallback to the first entry found
 527                 return matches[0]
 528         return None
 529
 530     def ordered_metadata(self):
 531         """
 532         Convenience method that returns an ordered version of the metadata
 533         dictionary. The return value is list of tuples (metadata name,
 534         metadata_value).
 535         """
 536         # copy the dict first
 537         metadata = self.metadata.copy()
 538         data_order = [
 539             'Project-Id-Version',
 540             'Report-Msgid-Bugs-To',
 541             'POT-Creation-Date',
 542             'PO-Revision-Date',
 543             'Last-Translator',
 544             'Language-Team',
 545             'Language',
 546             'MIME-Version',
 547             'Content-Type',
 548             'Content-Transfer-Encoding',
 549             'Plural-Forms',
 550         ]
 551         ordered_data = []
 552         for data in data_order:
 553             try:
 554                 value = metadata.pop(data)
 555                 ordered_data.append((data, value))
 556             except KeyError:
 557                 pass
 558         # the rest of the metadata will be alphabetically ordered since there
 559         # are no specs for this AFAIK
 560         for data in natural_sort(metadata.keys()):
 561             value = metadata[data]
 562             ordered_data.append((data, value))
 563         return ordered_data
 564
 565     def to_binary(self):
 566         """
 567         Return the binary representation of the file.
 568         """
 569         offsets = []
 570         entries = self.translated_entries()  # pylint: disable=no-member
 571
 572         # the keys are sorted in the .mo file
 573         def cmp(_self, other):  # pylint: disable=unused-variable
 574             # msgfmt compares entries with msgctxt if it exists
 575             self_msgid = _self.msgctxt or _self.msgid
 576             other_msgid = other.msgctxt or other.msgid
 577             if self_msgid > other_msgid:
 578                 return 1
 579             elif self_msgid < other_msgid:
 580                 return -1
 581             else:
 582                 return 0
 583
 584         # add metadata entry
 585         entries.sort(key=lambda o: o.msgid_with_context.encode('utf-8'))
 586         mentry = self.metadata_as_entry()
 587         entries = [mentry] + entries
 588         entries_len = len(entries)
 589         ids, strs = b(''), b('')
 590         for e in entries:
 591             # For each string, we need size and file offset.  Each string is
 592             # NUL terminated; the NUL does not count into the size.
 593             msgid = b('')
 594             if e.msgctxt:
 595                 # Contexts are stored by storing the concatenation of the
 596                 # context, a <EOT> byte, and the original string
 597                 msgid = self._encode(e.msgctxt + '\4')
 598             if e.msgid_plural:
 599                 msgstr = []
 600                 for index in sorted(e.msgstr_plural.keys()):
 601                     msgstr.append(e.msgstr_plural[index])
 602                 msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
 603                 msgstr = self._encode('\0'.join(msgstr))
 604             else:
 605                 msgid += self._encode(e.msgid)
 606                 msgstr = self._encode(e.msgstr)
 607             offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
 608             ids += msgid + b('\0')
 609             strs += msgstr + b('\0')
 610
 611         # The header is 7 32-bit unsigned integers.
 612         keystart = 7 * 4 + 16 * entries_len
 613         # and the values start after the keys
 614         valuestart = keystart + len(ids)
 615         koffsets = []
 616         voffsets = []
 617         # The string table first has the list of keys, then the list of values.
 618         # Each entry has first the size of the string, then the file offset.
 619         for o1, l1, o2, l2 in offsets:
 620             koffsets += [l1, o1 + keystart]
 621             voffsets += [l2, o2 + valuestart]
 622         offsets = koffsets + voffsets
 623
 624         output = struct.pack(
 625             'Iiiiiii',
 626             # Magic number
 627             MOFile.MAGIC,
 628             # Version
 629             0,
 630             # number of entries
 631             entries_len,
 632             # start of key index
 633             7 * 4,
 634             # start of value index
 635             7 * 4 + entries_len * 8,
 636             # size and offset of hash table, we don't use hash tables
 637             0,
 638             keystart,
 639         )
 640         if PY3 and sys.version_info.minor > 1:  # python 3.2 or superior
 641             output += array.array('i', offsets).tobytes()
 642         else:
 643             output += array.array('i', offsets).tostring()  # pylint: disable=no-member
 644         output += ids
 645         output += strs
 646         return output
 647
 648     def _encode(self, mixed):
 649         """
 650         Encodes the given ``mixed`` argument with the file encoding if and
 651         only if it's an unicode string and returns the encoded string.
 652         """
 653         if isinstance(mixed, text_type):
 654             mixed = mixed.encode(self.encoding)
 655         return mixed
 656
 657
 658 # }}}
 659 # class POFile {{{
 660
 661
 662 class POFile(_BaseFile):
 663     """
 664     Po (or Pot) file reader/writer.
 665     This class inherits the :class:`~polib._BaseFile` class and, by extension,
 666     the python ``list`` type.
 667     """
 668
 669     def __unicode__(self):
 670         """
 671         Returns the unicode representation of the po file.
 672         """
 673         ret, headers = '', self.header.split('\n')
 674         for header in headers:
 675             if not header:
 676                 ret += '#\n'
 677             elif header[:1] in [',', ':']:
 678                 ret += '#%s\n' % header
 679             else:
 680                 ret += '# %s\n' % header
 681
 682         if not isinstance(ret, text_type):
 683             ret = ret.decode(self.encoding)
 684
 685         return ret + _BaseFile.__unicode__(self)
 686
 687     def save_as_mofile(self, fpath):
 688         """
 689         Saves the binary representation of the file to given ``fpath``.
 690
 691         Keyword argument:
 692
 693         ``fpath``
 694             string, full or relative path to the mo file.
 695         """
 696         _BaseFile.save(self, fpath, 'to_binary')
 697
 698     def percent_translated(self):
 699         """
 700         Convenience method that returns the percentage of translated
 701         messages.
 702         """
 703         total = len([e for e in self if not e.obsolete])
 704         if total == 0:
 705             return 100
 706         translated = len(self.translated_entries())
 707         return int(translated * 100 / float(total))
 708
 709     def translated_entries(self):
 710         """
 711         Convenience method that returns the list of translated entries.
 712         """
 713         return [e for e in self if e.translated()]
 714
 715     def untranslated_entries(self):
 716         """
 717         Convenience method that returns the list of untranslated entries.
 718         """
 719         return [
 720             e for e in self if not e.translated() and not e.obsolete and not e.fuzzy
 721         ]
 722
 723     def fuzzy_entries(self):
 724         """
 725         Convenience method that returns the list of fuzzy entries.
 726         """
 727         return [e for e in self if e.fuzzy and not e.obsolete]
 728
 729     def obsolete_entries(self):
 730         """
 731         Convenience method that returns the list of obsolete entries.
 732         """
 733         return [e for e in self if e.obsolete]
 734
 735     def merge(self, refpot):
 736         """
 737         Convenience method that merges the current pofile with the pot file
 738         provided. It behaves exactly as the gettext msgmerge utility:
 739
 740         * comments of this file will be preserved, but extracted comments and
 741           occurrences will be discarded;
 742         * any translations or comments in the file will be discarded, however,
 743           dot comments and file positions will be preserved;
 744         * the fuzzy flags are preserved.
 745
 746         Keyword argument:
 747
 748         ``refpot``
 749             object POFile, the reference catalog.
 750         """
 751         # Store entries in dict/set for faster access
 752         self_entries = {entry.msgid_with_context: entry for entry in self}
 753         refpot_msgids = {entry.msgid_with_context for entry in refpot}
 754         # Merge entries that are in the refpot
 755         for entry in refpot:
 756             e = self_entries.get(entry.msgid_with_context)
 757             if e is None:
 758                 e = POEntry()
 759                 self.append(e)
 760             e.merge(entry)
 761         # ok, now we must "obsolete" entries that are not in the refpot anymore
 762         for entry in self:
 763             if entry.msgid_with_context not in refpot_msgids:
 764                 entry.obsolete = True
 765
 766
 767 # }}}
 768 # class MOFile {{{
 769
 770
class MOFile(_BaseFile):
    """
    Mo file reader/writer.
    This class inherits the :class:`~polib._BaseFile` class and, by
    extension, the python ``list`` type.
    """

    # magic number written at the start of the binary output
    MAGIC = 0x950412DE
    # the same four bytes in reverse order
    MAGIC_SWAPPED = 0xDE120495

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts all keywords arguments accepted by
        :class:`~polib._BaseFile` class.

        ``magic_number`` starts as ``None`` and ``version`` as ``0``.
        """
        _BaseFile.__init__(self, *args, **kwargs)
        self.magic_number = None
        self.version = 0

    def save_as_pofile(self, fpath):
        """
        Saves the mofile as a pofile to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        # the base class default output method is the text representation
        _BaseFile.save(self, fpath)

    # pylint: disable=arguments-differ
    def save(self, fpath=None):
        """
        Saves the mofile to ``fpath``, in its binary representation.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    def percent_translated(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns ``100``.
        """
        return 100

    def translated_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Returns the file object itself, not a copy.
        """
        return self

    def untranslated_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []

    def fuzzy_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []

    def obsolete_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []
 842
 843
 844 # }}}
 845 # class _BaseEntry {{{
 846
 847
 848 class _BaseEntry:
 849     """
 850     Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
 851     This class should **not** be instantiated directly.
 852     """
 853
 854     def __init__(self, *_args, **kwargs):
 855         """
 856         Constructor, accepts the following keyword arguments:
 857
 858         ``msgid``
 859             string, the entry msgid.
 860
 861         ``msgstr``
 862             string, the entry msgstr.
 863
 864         ``msgid_plural``
 865             string, the entry msgid_plural.
 866
 867         ``msgstr_plural``
 868             dict, the entry msgstr_plural lines.
 869
 870         ``msgctxt``
 871             string, the entry context (msgctxt).
 872
 873         ``obsolete``
 874             bool, whether the entry is "obsolete" or not.
 875
 876         ``encoding``
 877             string, the encoding to use, defaults to ``default_encoding``
 878             global variable (optional).
 879         """
 880         self.msgid = kwargs.get('msgid', '')
 881         self.msgstr = kwargs.get('msgstr', '')
 882         self.msgid_plural = kwargs.get('msgid_plural', '')
 883         self.msgstr_plural = kwargs.get('msgstr_plural', {})
 884         self.msgctxt = kwargs.get('msgctxt', None)
 885         self.obsolete = kwargs.get('obsolete', False)
 886         self.encoding = kwargs.get('encoding', default_encoding)
 887
 888     def __unicode__(self, wrapwidth=78):
 889         """
 890         Returns the unicode representation of the entry.
 891         """
 892         if self.obsolete:
 893             delflag = '#~ '
 894         else:
 895             delflag = ''
 896         ret = []
 897         # write the msgctxt if any
 898         if self.msgctxt is not None:
 899             ret += self._str_field('msgctxt', delflag, '', self.msgctxt, wrapwidth)
 900         # write the msgid
 901         ret += self._str_field('msgid', delflag, '', self.msgid, wrapwidth)
 902         # write the msgid_plural if any
 903         if self.msgid_plural:
 904             ret += self._str_field(
 905                 'msgid_plural', delflag, '', self.msgid_plural, wrapwidth
 906             )
 907         if self.msgstr_plural:
 908             # write the msgstr_plural if any
 909             msgstrs = self.msgstr_plural
 910             keys = list(msgstrs)
 911             keys.sort()
 912             for index in keys:
 913                 msgstr = msgstrs[index]
 914                 plural_index = '[%s]' % index
 915                 ret += self._str_field(
 916                     'msgstr', delflag, plural_index, msgstr, wrapwidth
 917                 )
 918         else:
 919             # otherwise write the msgstr
 920             ret += self._str_field('msgstr', delflag, '', self.msgstr, wrapwidth)
 921         ret.append('')
 922         ret = u('\n').join(ret)
 923         return ret
 924
 925     if PY3:
 926
 927         def __str__(self):
 928             return self.__unicode__()
 929
 930     else:
 931
 932         def __str__(self):
 933             """
 934             Returns the string representation of the entry.
 935             """
 936             return compat.ustr(self).encode(self.encoding)
 937
 938     def __eq__(self, other):
 939         return str(self) == str(other)
 940
 941     def __hash__(self):
 942         return hash(str(self))
 943
 944     def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78):
 945         lines = field.splitlines(True)
 946         if len(lines) > 1:
 947             lines = [''] + lines  # start with initial empty line
 948         else:
 949             escaped_field = escape(field)
 950             specialchars_count = 0
 951             for c in ['\\', '\n', '\r', '\t', '"']:
 952                 specialchars_count += field.count(c)
 953             # comparison must take into account fieldname length + one space
 954             # + 2 quotes (eg. msgid "<string>")
 955             flength = len(fieldname) + 3
 956             if plural_index:
 957                 flength += len(plural_index)
 958             real_wrapwidth = wrapwidth - flength + specialchars_count
 959             if wrapwidth > 0 and len(field) > real_wrapwidth:
 960                 # Wrap the line but take field name into account
 961                 lines = [''] + [
 962                     unescape(item)
 963                     for item in textwrap.wrap(
 964                         escaped_field,
 965                         wrapwidth - 2,  # 2 for quotes ""
 966                         drop_whitespace=False,
 967                         break_long_words=False,
 968                     )
 969                 ]
 970             else:
 971                 lines = [field]
 972         if fieldname.startswith('previous_'):
 973             # quick and dirty trick to get the real field name
 974             fieldname = fieldname[9:]
 975
 976         ret = [f'{delflag}{fieldname}{plural_index} "{escape(lines.pop(0))}"']
 977         for line in lines:
 978             ret.append(f'{delflag}"{escape(line)}"')
 979         return ret
 980
 981     @property
 982     def msgid_with_context(self):
 983         if self.msgctxt:
 984             return '{}{}{}'.format(self.msgctxt, '\x04', self.msgid)
 985         return self.msgid
 986
 987
 988 # }}}
 989 # class POEntry {{{
 990
 991
 992 class POEntry(_BaseEntry):
 993     """
 994     Represents a po file entry.
 995     """
 996
 997     def __init__(self, *args, **kwargs):
 998         """
 999         Constructor, accepts the following keyword arguments:
1000
1001         ``comment``
1002             string, the entry comment.
1003
1004         ``tcomment``
1005             string, the entry translator comment.
1006
1007         ``occurrences``
1008             list, the entry occurrences.
1009
1010         ``flags``
1011             list, the entry flags.
1012
1013         ``previous_msgctxt``
1014             string, the entry previous context.
1015
1016         ``previous_msgid``
1017             string, the entry previous msgid.
1018
1019         ``previous_msgid_plural``
1020             string, the entry previous msgid_plural.
1021
1022         ``linenum``
1023             integer, the line number of the entry
1024         """
1025         _BaseEntry.__init__(self, *args, **kwargs)
1026         self.comment = kwargs.get('comment', '')
1027         self.tcomment = kwargs.get('tcomment', '')
1028         self.occurrences = kwargs.get('occurrences', [])
1029         self.flags = kwargs.get('flags', [])
1030         self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
1031         self.previous_msgid = kwargs.get('previous_msgid', None)
1032         self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
1033         self.linenum = kwargs.get('linenum', None)
1034
1035     def __unicode__(self, wrapwidth=78):
1036         """
1037         Returns the unicode representation of the entry.
1038         """
1039         ret = []
1040         # comments first, if any (with text wrapping as xgettext does)
1041         if self.obsolete:
1042             comments = [('tcomment', '# ')]
1043         else:
1044             comments = [('comment', '#. '), ('tcomment', '# ')]
1045         for c in comments:
1046             val = getattr(self, c[0])
1047             if val:
1048                 for comment in val.split('\n'):
1049                     if len(comment) + len(c[1]) > wrapwidth > 0:
1050                         ret += textwrap.wrap(
1051                             comment,
1052                             wrapwidth,
1053                             initial_indent=c[1],
1054                             subsequent_indent=c[1],
1055                             break_long_words=False,
1056                         )
1057                     else:
1058                         ret.append(f'{c[1]}{comment}')
1059
1060         # occurrences (with text wrapping as xgettext does)
1061         if not self.obsolete and self.occurrences:
1062             filelist = []
1063             for fpath, lineno in self.occurrences:
1064                 if lineno:
1065                     filelist.append(f'{fpath}:{lineno}')
1066                 else:
1067                     filelist.append(fpath)
1068             filestr = ' '.join(filelist)
1069             if len(filestr) + 3 > wrapwidth > 0:
1070                 # textwrap split words that contain hyphen, this is not
1071                 # what we want for filenames, so the dirty hack is to
1072                 # temporally replace hyphens with a char that a file cannot
1073                 # contain, like "*"
1074                 ret += [
1075                     line.replace('*', '-')
1076                     for line in textwrap.wrap(
1077                         filestr.replace('-', '*'),
1078                         wrapwidth,
1079                         initial_indent='#: ',
1080                         subsequent_indent='#: ',
1081                         break_long_words=False,
1082                     )
1083                 ]
1084             else:
1085                 ret.append('#: ' + filestr)
1086
1087         # flags (TODO: wrapping ?)
1088         if self.flags:
1089             ret.append('#, %s' % ', '.join(self.flags))
1090
1091         # previous context and previous msgid/msgid_plural
1092         fields = ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural']
1093         if self.obsolete:
1094             prefix = '#~| '
1095         else:
1096             prefix = '#| '
1097         for f in fields:
1098             val = getattr(self, f)
1099             if val is not None:
1100                 ret += self._str_field(f, prefix, '', val, wrapwidth)
1101
1102         ret.append(_BaseEntry.__unicode__(self, wrapwidth))
1103         ret = u('\n').join(ret)
1104         return ret
1105
1106     # pylint: disable=too-many-return-statements
1107     def __cmp__(self, other):
1108         """
1109         Called by comparison operations if rich comparison is not defined.
1110         """
1111         # First: Obsolete test
1112         if self.obsolete != other.obsolete:
1113             if self.obsolete:
1114                 return -1
1115             else:
1116                 return 1
1117         # Work on a copy to protect original
1118         occ1 = sorted(self.occurrences[:])
1119         occ2 = sorted(other.occurrences[:])
1120         if occ1 > occ2:
1121             return 1
1122         if occ1 < occ2:
1123             return -1
1124         # Compare context
1125         msgctxt = self.msgctxt or '0'
1126         othermsgctxt = other.msgctxt or '0'
1127         if msgctxt > othermsgctxt:
1128             return 1
1129         elif msgctxt < othermsgctxt:
1130             return -1
1131         # Compare msgid_plural
1132         msgid_plural = self.msgid_plural or '0'
1133         othermsgid_plural = other.msgid_plural or '0'
1134         if msgid_plural > othermsgid_plural:
1135             return 1
1136         elif msgid_plural < othermsgid_plural:
1137             return -1
1138         # Compare msgstr_plural
1139         if self.msgstr_plural and isinstance(self.msgstr_plural, dict):
1140             msgstr_plural = list(self.msgstr_plural.values())
1141         else:
1142             msgstr_plural = []
1143         if other.msgstr_plural and isinstance(other.msgstr_plural, dict):
1144             othermsgstr_plural = list(other.msgstr_plural.values())
1145         else:
1146             othermsgstr_plural = []
1147         if msgstr_plural > othermsgstr_plural:
1148             return 1
1149         elif msgstr_plural < othermsgstr_plural:
1150             return -1
1151         # Compare msgid
1152         if self.msgid > other.msgid:
1153             return 1
1154         elif self.msgid < other.msgid:
1155             return -1
1156         # Compare msgstr
1157         if self.msgstr > other.msgstr:
1158             return 1
1159         elif self.msgstr < other.msgstr:
1160             return -1
1161         return 0
1162
1163     def __gt__(self, other):
1164         return self.__cmp__(other) > 0
1165
1166     def __lt__(self, other):
1167         return self.__cmp__(other) < 0
1168
1169     def __ge__(self, other):
1170         return self.__cmp__(other) >= 0
1171
1172     def __le__(self, other):
1173         return self.__cmp__(other) <= 0
1174
1175     def __eq__(self, other):
1176         return self.__cmp__(other) == 0
1177
1178     def __ne__(self, other):
1179         return self.__cmp__(other) != 0
1180
1181     def translated(self):
1182         """
1183         Returns ``True`` if the entry has been translated or ``False``
1184         otherwise.
1185         """
1186         if self.obsolete or self.fuzzy:
1187             return False
1188         if self.msgstr != '':
1189             return True
1190         if self.msgstr_plural:
1191             for pos in self.msgstr_plural:
1192                 if self.msgstr_plural[pos] == '':
1193                     return False
1194             return True
1195         return False
1196
1197     def merge(self, other):
1198         """
1199         Merge the current entry with the given pot entry.
1200         """
1201         self.msgid = other.msgid
1202         self.msgctxt = other.msgctxt
1203         self.occurrences = other.occurrences
1204         self.comment = other.comment
1205         fuzzy = self.fuzzy
1206         self.flags = other.flags[:]  # clone flags
1207         if fuzzy:
1208             self.flags.append('fuzzy')
1209         self.msgid_plural = other.msgid_plural
1210         self.obsolete = other.obsolete
1211         self.previous_msgctxt = other.previous_msgctxt
1212         self.previous_msgid = other.previous_msgid
1213         self.previous_msgid_plural = other.previous_msgid_plural
1214         if other.msgstr_plural:
1215             for pos in other.msgstr_plural:
1216                 try:
1217                     # keep existing translation at pos if any
1218                     self.msgstr_plural[pos]
1219                 except KeyError:
1220                     self.msgstr_plural[pos] = ''
1221
1222     @property
1223     def fuzzy(self):
1224         return 'fuzzy' in self.flags
1225
1226     def __hash__(self):
1227         return hash((self.msgid, self.msgstr))
1228
1229
1230 # }}}
1231 # class MOEntry {{{
1232
1233
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor.  For consistency with :class:`~polib.POEntry` the
        following keyword arguments are accepted:

        ``comment``
        ``tcomment``
        ``occurrences``
        ``flags``
        ``previous_msgctxt``
        ``previous_msgid``
        ``previous_msgid_plural``

        Note: they hold no real meaning in the context of MO files, so
        whatever is passed for them is ignored and the matching attributes
        are always set to their empty values.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # PO-only attributes, reset regardless of the keyword arguments
        self.comment = self.tcomment = ''
        self.occurrences, self.flags = [], []
        for attr in ('previous_msgctxt', 'previous_msgid', 'previous_msgid_plural'):
            setattr(self, attr, None)

    def __hash__(self):
        """Hash computed from the ``(msgid, msgstr)`` pair."""
        key = (self.msgid, self.msgstr)
        return hash(key)
1267
1268
1269 # }}}
1270 # class _POFileParser {{{
1271
1272
class _POFileParser:
    """
    A finite state machine to parse efficiently and correctly po
    file format.
    """

    # pylint: disable=redefined-outer-name
    def __init__(self, pofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``pofile``
            string, path to the po file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class instantiated to hold the parsed entries (optional,
            default: ``POFile``).
        """
        enc = kwargs.get('encoding', default_encoding)
        if _is_file(pofile):
            try:
                self.fhandle = open(pofile, encoding=enc)
            except LookupError:
                # unknown encoding name: fall back to the default one
                enc = default_encoding
                self.fhandle = open(pofile, encoding=enc)
        else:
            self.fhandle = pofile.splitlines()

        klass = kwargs.get('klass')
        if klass is None:
            klass = POFile
        self.instance = klass(
            pofile=pofile,
            encoding=enc,
            check_for_duplicates=kwargs.get('check_for_duplicates', False),
        )
        self.transitions = {}
        self.current_line = 0
        self.current_entry = POEntry(linenum=self.current_line)
        self.current_state = 'st'
        self.current_token = None
        # two memo flags used in handlers
        self.msgstr_index = 0
        self.entry_obsolete = 0
        # Configure the state machine, by adding transitions.
        # Signification of symbols:
        #     * ST: Beginning of the file (start)
        #     * HE: Header
        #     * TC: a translation comment
        #     * GC: a generated comment
        #     * OC: a file/line occurrence
        #     * FL: a flags line
        #     * CT: a message context
        #     * PC: a previous msgctxt
        #     * PM: a previous msgid
        #     * PP: a previous msgid_plural
        #     * MI: a msgid
        #     * MP: a msgid plural
        #     * MS: a msgstr
        #     * MX: a msgstr plural
        #     * MC: a msgid or msgstr continuation line
        all_states = [
            'st',
            'he',
            'gc',
            'oc',
            'fl',
            'ct',
            'pc',
            'pm',
            'pp',
            'tc',
            'ms',
            'mp',
            'mx',
            'mi',
        ]

        self.add('tc', ['st', 'he'], 'he')
        self.add(
            'tc',
            ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mp', 'mx', 'mi'],
            'tc',
        )
        self.add('gc', all_states, 'gc')
        self.add('oc', all_states, 'oc')
        self.add('fl', all_states, 'fl')
        self.add('pc', all_states, 'pc')
        self.add('pm', all_states, 'pm')
        self.add('pp', all_states, 'pp')
        self.add(
            'ct',
            ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
            'ct',
        )
        self.add(
            'mi',
            ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
            'mi',
        )
        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
        self.add('ms', ['mi', 'mp', 'tc'], 'ms')
        self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')

    # pylint: disable=too-many-branches
    def parse(self):
        """
        Run the state machine, parse the file line by line and call process()
        with the current matched symbol.

        Returns the file instance filled with the parsed entries and
        metadata.  Raises ``OSError`` on a syntax error.  When the input is
        a file handle it is closed before returning, whether parsing
        succeeded or failed.
        """

        keywords = {
            'msgctxt': 'ct',
            'msgid': 'mi',
            'msgstr': 'ms',
            'msgid_plural': 'mp',
        }
        prev_keywords = {
            'msgid_plural': 'pp',
            'msgid': 'pm',
            'msgctxt': 'pc',
        }
        # a double quote that is not preceded by a backslash
        has_unescaped_quote = re.compile(r'([^\\]|^)"').search
        bom = codecs.BOM_UTF8.decode('utf-8')
        tokens = []
        fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
        try:
            for line in self.fhandle:
                self.current_line += 1
                if self.current_line == 1 and line.startswith(bom):
                    line = line[len(bom) :]
                line = line.strip()
                if line == '':
                    continue

                tokens = line.split(None, 2)
                nb_tokens = len(tokens)

                # previous fields of obsolete entries are not kept
                if tokens[0] == '#~|':
                    continue

                if tokens[0] == '#~' and nb_tokens > 1:
                    # obsolete entry: drop the marker, then parse as usual
                    line = line[3:].strip()
                    tokens = tokens[1:]
                    nb_tokens -= 1
                    self.entry_obsolete = 1
                else:
                    self.entry_obsolete = 0

                # Take care of keywords like
                # msgid, msgid_plural, msgctxt & msgstr.
                if tokens[0] in keywords and nb_tokens > 1:
                    line = line[len(tokens[0]) :].lstrip()
                    if has_unescaped_quote(line[1:-1]):
                        raise OSError(
                            'Syntax error in po file %s(line %s): '
                            'unescaped double quote found'
                            % (fpath, self.current_line)
                        )
                    self.current_token = line
                    self.process(keywords[tokens[0]])
                    continue

                self.current_token = line

                if tokens[0] == '#:':
                    if nb_tokens <= 1:
                        continue
                    # we are on a occurrences line
                    self.process('oc')

                elif line[:1] == '"':
                    # we are on a continuation line
                    if has_unescaped_quote(line[1:-1]):
                        raise OSError(
                            'Syntax error in po file %s(line %s): '
                            'unescaped double quote found'
                            % (fpath, self.current_line)
                        )
                    self.process('mc')

                elif line[:7] == 'msgstr[':
                    # we are on a msgstr plural
                    self.process('mx')

                elif tokens[0] == '#,':
                    if nb_tokens <= 1:
                        continue
                    # we are on a flags line
                    self.process('fl')

                elif tokens[0] == '#' or tokens[0].startswith('##'):
                    # we are on a translator comment line
                    self.process('tc')

                elif tokens[0] == '#.':
                    if nb_tokens <= 1:
                        continue
                    # we are on a generated comment line
                    self.process('gc')

                elif tokens[0] == '#|':
                    if nb_tokens <= 1:
                        raise OSError(
                            'Syntax error in po file %s(line %s)'
                            % (fpath, self.current_line)
                        )

                    # Remove the marker and any whitespace right after that.
                    line = line[2:].lstrip()
                    self.current_token = line

                    if tokens[1].startswith('"'):
                        # Continuation of previous metadata.
                        self.process('mc')
                        continue

                    if nb_tokens == 2:
                        # Invalid continuation line.
                        raise OSError(
                            'Syntax error in po file %s(line %s): '
                            'invalid continuation line'
                            % (fpath, self.current_line)
                        )

                    # we are on a "previous translation" comment line,
                    if tokens[1] not in prev_keywords:
                        # Unknown keyword in previous translation comment.
                        raise OSError(
                            'Syntax error in po file %s(line %s): '
                            'unknown keyword %s'
                            % (fpath, self.current_line, tokens[1])
                        )

                    # Remove the keyword and any whitespace
                    # between it and the starting quote.
                    line = line[len(tokens[1]) :].lstrip()
                    self.current_token = line
                    self.process(prev_keywords[tokens[1]])

                else:
                    raise OSError(
                        f'Syntax error in po file {fpath}(line {self.current_line})'
                    )

            if (
                self.current_entry
                and len(tokens) > 0
                and not tokens[0].startswith('#')
            ):
                # since entries are added when another entry is found, we
                # must add the last entry here (only if there are lines).
                # Trailing comments are ignored
                self.instance.append(self.current_entry)

            # before returning the instance, check if there's metadata and if
            # so extract it in a dict
            metadataentry = self.instance.find('')
            if metadataentry:  # metadata found
                # remove the entry
                self.instance.remove(metadataentry)
                self.instance.metadata_is_fuzzy = metadataentry.flags
                key = None
                for msg in metadataentry.msgstr.splitlines():
                    try:
                        key, val = msg.split(':', 1)
                        self.instance.metadata[key] = val.strip()
                    except (ValueError, KeyError):
                        # no colon on this line: it continues the value of
                        # the last key seen, if any
                        if key is not None:
                            self.instance.metadata[key] += '\n' + msg.strip()
        finally:
            # close opened file, also when parsing is aborted by an error
            self._close_fhandle()
        return self.instance

    def _close_fhandle(self):
        """Close the input when it is a file handle (a list has no close)."""
        close = getattr(self.fhandle, 'close', None)
        if close is not None:
            close()

    def add(self, symbol, states, next_state):
        """
        Add a transition to the state machine.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``states``
            list, a list of states (two chars symbols).

        ``next_state``
            the next state the fsm will have after the action.
        """
        # the handler only depends on next_state, look it up once
        action = getattr(self, 'handle_%s' % next_state)
        for state in states:
            self.transitions[(symbol, state)] = (action, next_state)

    def process(self, symbol):
        """
        Process the transition corresponding to the current state and the
        symbol provided.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        The current state only changes when the handler returns a true
        value.  Any error, including a missing transition, closes the input
        and is reported as an ``OSError`` carrying the current line number.
        """
        try:
            (action, state) = self.transitions[(symbol, self.current_state)]
            if action():
                self.current_state = state
        except Exception as exc:
            fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
            self._close_fhandle()
            raise OSError(
                f'Syntax error in po file {fpath}(line {self.current_line})'
            ) from exc

    # state handlers

    def _begin_entry_if_complete(self):
        """
        Store the entry being built and start a new one when the previous
        line belonged to a translation (msgstr or msgstr plural).
        """
        if self.current_state in ('mc', 'ms', 'mx'):
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)

    def handle_he(self):
        """Handle a header comment."""
        if self.instance.header != '':
            self.instance.header += '\n'
        self.instance.header += self.current_token[2:]
        return True

    def handle_tc(self):
        """Handle a translator comment."""
        self._begin_entry_if_complete()
        if self.current_entry.tcomment != '':
            self.current_entry.tcomment += '\n'
        tcomment = self.current_token.lstrip('#')
        if tcomment.startswith(' '):
            tcomment = tcomment[1:]
        self.current_entry.tcomment += tcomment
        return True

    def handle_gc(self):
        """Handle a generated comment."""
        self._begin_entry_if_complete()
        if self.current_entry.comment != '':
            self.current_entry.comment += '\n'
        self.current_entry.comment += self.current_token[3:]
        return True

    def handle_oc(self):
        """Handle a file:num occurrence."""
        self._begin_entry_if_complete()
        occurrences = self.current_token[3:].split()
        for occurrence in occurrences:
            if occurrence != '':
                try:
                    fil, line = occurrence.rsplit(':', 1)
                    if not line.isdigit():
                        # the colon is part of the path, not a line number
                        fil = occurrence
                        line = ''
                    self.current_entry.occurrences.append((fil, line))
                except (ValueError, AttributeError):
                    # no colon at all: a path without line number
                    self.current_entry.occurrences.append((occurrence, ''))
        return True

    def handle_fl(self):
        """Handle a flags line."""
        self._begin_entry_if_complete()
        self.current_entry.flags += [
            c.strip() for c in self.current_token[3:].split(',')
        ]
        return True

    def handle_pp(self):
        """Handle a previous msgid_plural line."""
        self._begin_entry_if_complete()
        self.current_entry.previous_msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_pm(self):
        """Handle a previous msgid line."""
        self._begin_entry_if_complete()
        self.current_entry.previous_msgid = unescape(self.current_token[1:-1])
        return True

    def handle_pc(self):
        """Handle a previous msgctxt line."""
        self._begin_entry_if_complete()
        self.current_entry.previous_msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_ct(self):
        """Handle a msgctxt."""
        self._begin_entry_if_complete()
        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_mi(self):
        """Handle a msgid."""
        self._begin_entry_if_complete()
        self.current_entry.obsolete = self.entry_obsolete
        self.current_entry.msgid = unescape(self.current_token[1:-1])
        return True

    def handle_mp(self):
        """Handle a msgid plural."""
        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_ms(self):
        """Handle a msgstr."""
        self.current_entry.msgstr = unescape(self.current_token[1:-1])
        return True

    def handle_mx(self):
        """Handle a msgstr plural."""
        token = self.current_token
        # read every digit of the index, so that msgstr[10] is not
        # mistaken for msgstr[1]
        match = re.match(r'msgstr\[(\d+)', token)
        if match is None:
            raise ValueError('invalid msgstr plural index in %r' % token)
        index = int(match.group(1))
        value = token[token.find('"') + 1 : -1]
        self.current_entry.msgstr_plural[index] = unescape(value)
        self.msgstr_index = index
        return True

    def handle_mc(self):
        """Handle a msgid or msgstr continuation line."""
        token = unescape(self.current_token[1:-1])
        # the text is appended to the field selected by the current state
        if self.current_state == 'ct':
            self.current_entry.msgctxt += token
        elif self.current_state == 'mi':
            self.current_entry.msgid += token
        elif self.current_state == 'mp':
            self.current_entry.msgid_plural += token
        elif self.current_state == 'ms':
            self.current_entry.msgstr += token
        elif self.current_state == 'mx':
            self.current_entry.msgstr_plural[self.msgstr_index] += token
        elif self.current_state == 'pp':
            self.current_entry.previous_msgid_plural += token
        elif self.current_state == 'pm':
            self.current_entry.previous_msgid += token
        elif self.current_state == 'pc':
            self.current_entry.previous_msgctxt += token
        # don't change the current state
        return False
1731
1732
1733 # }}}
1734 # class _MOFileParser {{{
1735
1736
class _MOFileParser:
    """
    A class to parse binary mo files.

    The parser reads the header, the two tables of (length, offset) pairs
    describing the original and translated strings, then builds one entry
    per pair on the instance created in the constructor.
    """

    # pylint: disable=unused-argument,redefined-outer-name
    def __init__(self, mofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``mofile``
            string, path to the mo file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class used to create the resulting instance, defaults to
            ``MOFile`` (optional).
        """
        if _is_file(mofile):
            self.fhandle = open(mofile, 'rb')
        else:
            self.fhandle = io.BytesIO(mofile)

        klass = kwargs.get('klass')
        if klass is None:
            klass = MOFile
        self.instance = klass(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False),
        )

    def __del__(self):
        """
        Make sure the file is closed, this prevents warnings on unclosed file
        when running tests with python >= 3.2.
        """
        # The constructor may have failed before ``fhandle`` was assigned
        # (e.g. the content could not be wrapped), so never assume it exists.
        fhandle = getattr(self, 'fhandle', None)
        if fhandle and hasattr(fhandle, 'close'):
            fhandle.close()

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Returns the populated instance. Raises ``OSError`` when the magic
        number or the major revision number is not supported; a file too
        short to hold the expected fields surfaces as ``struct.error``.
        The file handle is closed in every case.
        """
        try:
            self._parse_content()
        finally:
            # close opened file, even when the content turned out invalid
            self.fhandle.close()
        return self.instance

    def _parse_content(self):
        """Read the header, the metadata and every entry of the mo file."""
        # parse magic number: it is read as little-endian, so reading the
        # swapped value means the remaining fields are big-endian
        magic_number = self._readbinary('<I', 4)
        if magic_number == MOFile.MAGIC:
            ii = '<II'
        elif magic_number == MOFile.MAGIC_SWAPPED:
            ii = '>II'
        else:
            raise OSError('Invalid mo file, magic number is incorrect !')
        self.instance.magic_number = magic_number
        # parse the version number and the number of strings
        version, numofstrings = self._readbinary(ii, 8)
        # from MO file format specs: "A program seeing an unexpected major
        # revision number should stop reading the MO file entirely"
        if version >> 16 not in (0, 1):
            raise OSError('Invalid mo file, unexpected major revision number')
        self.instance.version = version
        # offsets of the original strings table and translations table
        msgids_table_offset, msgstrs_table_offset = self._readbinary(ii, 8)
        msgids_index = self._read_index(msgids_table_offset, numofstrings, ii)
        msgstrs_index = self._read_index(msgstrs_table_offset, numofstrings, ii)
        # build entries
        encoding = self.instance.encoding
        pairs = zip(msgids_index, msgstrs_index)
        for i, ((id_len, id_pos), (str_len, str_pos)) in enumerate(pairs):
            self.fhandle.seek(id_pos)
            msgid = self.fhandle.read(id_len)

            self.fhandle.seek(str_pos)
            msgstr = self.fhandle.read(str_len)
            if i == 0 and not msgid:  # metadata
                self.instance.metadata = self._parse_metadata(msgstr, encoding)
                continue
            # test if we have a plural entry: singular and plural msgids are
            # separated by a NUL byte, and so are the translated forms
            msgid_tokens = msgid.split(b'\x00')
            if len(msgid_tokens) > 1:
                entry = self._build_entry(
                    msgid=msgid_tokens[0],
                    msgid_plural=msgid_tokens[1],
                    msgstr_plural=dict(enumerate(msgstr.split(b'\x00'))),
                )
            else:
                entry = self._build_entry(msgid=msgid, msgstr=msgstr)
            self.instance.append(entry)

    def _read_index(self, offset, count, fmt):
        """Read ``count`` (length, offset) pairs stored from ``offset`` on."""
        self.fhandle.seek(offset)
        return [self._readbinary(fmt, 8) for _ in range(count)]

    @staticmethod
    def _parse_metadata(raw_metadata, encoding):
        """
        Turn the msgstr of the header entry into a ``{name: value}`` dict.

        Every line with a non-empty name is split on its first colon; the
        value is stripped, and a line without colon yields an empty value.
        """
        metadata = {}
        for line in raw_metadata.split(b'\n'):
            name, _, value = line.partition(b':')
            if name:
                metadata[name.decode(encoding)] = value.decode(encoding).strip()
        return metadata

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None, msgstr_plural=None):
        """
        Create a ``MOEntry`` from raw bytes decoded with the instance
        encoding.

        ``msgid`` may carry a context, separated from the real msgid by an
        EOT byte (``\\x04``). ``msgstr_plural``, when given, maps plural
        indexes to raw bytes; the mapping passed in is left untouched.
        """
        encoding = self.instance.encoding
        # Split on the first EOT only, so that a msgid which itself contains
        # an EOT byte is not silently cut short.
        msgctxt, separator, real_msgid = msgid.partition(b'\x04')
        if separator:
            kwargs = {
                'msgctxt': msgctxt.decode(encoding),
                'msgid': real_msgid.decode(encoding),
            }
        else:
            kwargs = {'msgid': msgid.decode(encoding)}
        if msgstr:
            kwargs['msgstr'] = msgstr.decode(encoding)
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
        if msgstr_plural:
            kwargs['msgstr_plural'] = {
                index: form.decode(encoding)
                for index, form in msgstr_plural.items()
            }
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpack n bytes of data using format <fmt>.
        It returns a tuple or a mixed value if the tuple length is 1.

        ``struct.error`` is raised when fewer bytes than the format needs
        could be read.
        """
        content = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, content)
        if len(tup) == 1:
            return tup[0]
        return tup
1881
1882
1883 # }}}