cola/polib.py

   1 #
   2 # License: MIT (see extras/polib/LICENSE file provided)
   3 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
   4
   5 """
   6 **polib** allows you to manipulate, create, modify gettext files (pot, po and
mo files).  You can load existing files, iterate through its entries, add,
   8 modify entries, comments or metadata, etc. or create new po files from scratch.
   9
  10 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
  11 :func:`~polib.mofile` convenience functions.
  12 """
  13 import array
  14 import codecs
  15 import os
  16 import re
  17 import struct
  18 import sys
  19 import textwrap
  20 import io
  21
  22 from . import compat
  23
  24
# module metadata
__author__ = 'David Jean Louis <izimobil@gmail.com>'
__version__ = '1.1.1'
# names exported by ``from <module> import *``
__all__ = [
    'pofile',
    'POFile',
    'POEntry',
    'mofile',
    'MOFile',
    'MOEntry',
    'default_encoding',
    'escape',
    'unescape',
    'detect_encoding',
]


# the default encoding to use when encoding cannot be detected
# (also the fallback ``encoding`` of file and entry objects)
default_encoding = 'utf-8'
  43
  44 # python 2/3 compatibility helpers {{{
  45
  46
# ``PY3`` is hard-coded to True, so branches guarded by ``not PY3`` (or the
# ``else`` of ``if PY3``) are never taken.
PY3 = True
# the type used throughout the module for ``isinstance`` checks on text
text_type = str
  49
  50
  51 def b(s):
  52     return s.encode('utf-8')
  53
  54
  55 def u(s):
  56     return s
  57
  58
  59 # }}}
  60 # _pofile_or_mofile {{{
  61
  62
  63 def _pofile_or_mofile(f, filetype, **kwargs):
  64     """
  65     Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
  66     honor the DRY concept.
  67     """
  68     # get the file encoding
  69     enc = kwargs.get('encoding')
  70     if enc is None:
  71         enc = detect_encoding(f, filetype == 'mofile')
  72
  73     # parse the file
  74     kls = _POFileParser if filetype == 'pofile' else _MOFileParser
  75     parser = kls(
  76         f,
  77         encoding=enc,
  78         check_for_duplicates=kwargs.get('check_for_duplicates', False),
  79         klass=kwargs.get('klass'),
  80     )
  81     instance = parser.parse()
  82     instance.wrapwidth = kwargs.get('wrapwidth', 78)
  83     return instance
  84
  85
  86 # }}}
  87 # _is_file {{{
  88
  89
  90 def _is_file(filename_or_contents):
  91     """
  92     Safely returns the value of os.path.exists(filename_or_contents).
  93
  94     Arguments:
  95
  96     ``filename_or_contents``
  97         either a filename, or a string holding the contents of some file.
  98         In the latter case, this function will always return False.
  99     """
 100     try:
 101         return os.path.isfile(filename_or_contents)
 102     except (TypeError, ValueError, UnicodeEncodeError):
 103         return False
 104
 105
 106 # }}}
 107 # function pofile() {{{
 108
 109
 110 def pofile(pofile, **kwargs):
 111     """
 112     Convenience function that parses the po or pot file ``pofile`` and returns
 113     a :class:`~polib.POFile` instance.
 114
 115     Arguments:
 116
 117     ``pofile``
 118         string, full or relative path to the po/pot file or its content (data).
 119
 120     ``wrapwidth``
 121         integer, the wrap width, only useful when the ``-w`` option was passed
 122         to xgettext (optional, default: ``78``).
 123
 124     ``encoding``
 125         string, the encoding to use (e.g. "utf-8") (default: ``None``, the
 126         encoding will be auto-detected).
 127
 128     ``check_for_duplicates``
 129         whether to check for duplicate entries when adding entries to the
 130         file (optional, default: ``False``).
 131
 132     ``klass``
 133         class which is used to instantiate the return value (optional,
 134         default: ``None``, the return value with be a :class:`~polib.POFile`
 135         instance).
 136     """
 137     return _pofile_or_mofile(pofile, 'pofile', **kwargs)
 138
 139
 140 # }}}
 141 # function mofile() {{{
 142
 143
 144 def mofile(mofile, **kwargs):
 145     """
 146     Convenience function that parses the mo file ``mofile`` and returns a
 147     :class:`~polib.MOFile` instance.
 148
 149     Arguments:
 150
 151     ``mofile``
 152         string, full or relative path to the mo file or its content (string
 153         or bytes).
 154
 155     ``wrapwidth``
 156         integer, the wrap width, only useful when the ``-w`` option was passed
 157         to xgettext to generate the po file that was used to format the mo file
 158         (optional, default: ``78``).
 159
 160     ``encoding``
 161         string, the encoding to use (e.g. "utf-8") (default: ``None``, the
 162         encoding will be auto-detected).
 163
 164     ``check_for_duplicates``
 165         whether to check for duplicate entries when adding entries to the
 166         file (optional, default: ``False``).
 167
 168     ``klass``
 169         class which is used to instantiate the return value (optional,
 170         default: ``None``, the return value with be a :class:`~polib.POFile`
 171         instance).
 172     """
 173     return _pofile_or_mofile(mofile, 'mofile', **kwargs)
 174
 175
 176 # }}}
 177 # function detect_encoding() {{{
 178
 179
 180 def detect_encoding(file, binary_mode=False):
 181     """
 182     Try to detect the encoding used by the ``file``. The ``file`` argument can
 183     be a PO or MO file path or a string containing the contents of the file.
 184     If the encoding cannot be detected, the function will return the value of
 185     ``default_encoding``.
 186
 187     Arguments:
 188
 189     ``file``
 190         string, full or relative path to the po/mo file or its content.
 191
 192     ``binary_mode``
 193         boolean, set this to True if ``file`` is a mo file.
 194     """
 195     PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
 196     rxt = re.compile(u(PATTERN))
 197     rxb = re.compile(b(PATTERN))
 198
 199     def charset_exists(charset):
 200         """Check whether ``charset`` is valid or not."""
 201         try:
 202             codecs.lookup(charset)
 203         except LookupError:
 204             return False
 205         return True
 206
 207     if not _is_file(file):
 208         try:
 209             match = rxt.search(file)
 210         except TypeError:
 211             match = rxb.search(file)
 212         if match:
 213             enc = match.group(1).strip()
 214             if not isinstance(enc, text_type):
 215                 enc = enc.decode('utf-8')
 216             if charset_exists(enc):
 217                 return enc
 218     else:
 219         # For PY3, always treat as binary
 220         if binary_mode or PY3:
 221             mode = 'rb'
 222             rx = rxb
 223         else:
 224             mode = 'r'
 225             rx = rxt
 226         f = open(file, mode)
 227         for line in f.readlines():
 228             match = rx.search(line)
 229             if match:
 230                 f.close()
 231                 enc = match.group(1).strip()
 232                 if not isinstance(enc, text_type):
 233                     enc = enc.decode('utf-8')
 234                 if charset_exists(enc):
 235                     return enc
 236         f.close()
 237     return default_encoding
 238
 239
 240 # }}}
 241 # function escape() {{{
 242
 243
 244 def escape(st):
 245     """
 246     Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 247     the given string ``st`` and returns it.
 248     """
 249     return (
 250         st.replace('\\', r'\\')
 251         .replace('\t', r'\t')
 252         .replace('\r', r'\r')
 253         .replace('\n', r'\n')
 254         .replace('"', r'\"')
 255     )
 256
 257
 258 # }}}
 259 # function unescape() {{{
 260
 261
 262 def unescape(st):
 263     """
 264     Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 265     the given string ``st`` and returns it.
 266     """
 267
 268     def unescape_repl(m):
 269         m = m.group(1)
 270         if m == 'n':
 271             return '\n'
 272         if m == 't':
 273             return '\t'
 274         if m == 'r':
 275             return '\r'
 276         if m == '\\':
 277             return '\\'
 278         return m  # handles escaped double quote
 279
 280     return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
 281
 282
 283 # }}}
 284 # function natural_sort() {{{
 285
 286
 287 def natural_sort(lst):
 288     """
 289     Sort naturally the given list.
 290     Credits: http://stackoverflow.com/a/4836734
 291     """
 292
 293     def convert(text):
 294         return int(text) if text.isdigit() else text.lower()
 295
 296     def alphanum_key(key):
 297         return [convert(c) for c in re.split('([0-9]+)', key)]
 298
 299     return sorted(lst, key=alphanum_key)
 300
 301
 302 # }}}
 303 # class _BaseFile {{{
 304
 305
 306 class _BaseFile(list):
 307     """
 308     Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
 309     classes. This class should **not** be instantiated directly.
 310     """
 311
 312     def __init__(self, *_args, **kwargs):
 313         """
 314         Constructor, accepts the following keyword arguments:
 315
 316         ``pofile``
 317             string, the path to the po or mo file, or its content as a string.
 318
 319         ``wrapwidth``
 320             integer, the wrap width, only useful when the ``-w`` option was
 321             passed to xgettext (optional, default: ``78``).
 322
 323         ``encoding``
 324             string, the encoding to use, defaults to ``default_encoding``
 325             global variable (optional).
 326
 327         ``check_for_duplicates``
 328             whether to check for duplicate entries when adding entries to the
 329             file, (optional, default: ``False``).
 330         """
 331         list.__init__(self)
 332         # the opened file handle
 333         pofile = kwargs.get('pofile', None)
 334         if pofile and _is_file(pofile):
 335             self.fpath = pofile
 336         else:
 337             self.fpath = kwargs.get('fpath')
 338         # the width at which lines should be wrapped
 339         self.wrapwidth = kwargs.get('wrapwidth', 78)
 340         # the file encoding
 341         self.encoding = kwargs.get('encoding', default_encoding)
 342         # whether to check for duplicate entries or not
 343         self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
 344         # header
 345         self.header = ''
 346         # both po and mo files have metadata
 347         self.metadata = {}
 348         self.metadata_is_fuzzy = 0
 349
 350     def __unicode__(self):
 351         """
 352         Returns the unicode representation of the file.
 353         """
 354         ret = []
 355         entries = [self.metadata_as_entry()] + [e for e in self if not e.obsolete]
 356         for entry in entries:
 357             ret.append(entry.__unicode__(self.wrapwidth))
 358         for entry in self.obsolete_entries():
 359             ret.append(entry.__unicode__(self.wrapwidth))
 360         ret = u('\n').join(ret)
 361         return ret
 362
 363     if PY3:
 364
 365         def __str__(self):
 366             return self.__unicode__()
 367
 368     else:
 369
 370         def __str__(self):
 371             """
 372             Returns the string representation of the file.
 373             """
 374             return compat.ustr(self).encode(self.encoding)
 375
 376     def __contains__(self, entry):
 377         """
 378         Overridden ``list`` method to implement the membership test (in and
 379         not in).
 380         The method considers that an entry is in the file if it finds an entry
 381         that has the same msgid (the test is **case sensitive**) and the same
 382         msgctxt (or none for both entries).
 383
 384         Argument:
 385
 386         ``entry``
 387             an instance of :class:`~polib._BaseEntry`.
 388         """
 389         return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) is not None
 390
 391     def __eq__(self, other):
 392         return str(self) == str(other)
 393
 394     def __hash__(self):
 395         return hash(str(self))
 396
 397     def append(self, entry):
 398         """
 399         Overridden method to check for duplicates entries, if a user tries to
 400         add an entry that is already in the file, the method will raise a
 401         ``ValueError`` exception.
 402
 403         Argument:
 404
 405         ``entry``
 406             an instance of :class:`~polib._BaseEntry`.
 407         """
 408         # check_for_duplicates may not be defined (yet) when unpickling.
 409         # But if pickling, we never want to check for duplicates anyway.
 410         if getattr(self, 'check_for_duplicates', False) and entry in self:
 411             raise ValueError('Entry "%s" already exists' % entry.msgid)
 412         super().append(entry)
 413
 414     def insert(self, index, entry):
 415         """
 416         Overridden method to check for duplicates entries, if a user tries to
 417         add an entry that is already in the file, the method will raise a
 418         ``ValueError`` exception.
 419
 420         Arguments:
 421
 422         ``index``
 423             index at which the entry should be inserted.
 424
 425         ``entry``
 426             an instance of :class:`~polib._BaseEntry`.
 427         """
 428         if self.check_for_duplicates and entry in self:
 429             raise ValueError('Entry "%s" already exists' % entry.msgid)
 430         super().insert(index, entry)
 431
 432     def metadata_as_entry(self):
 433         """
 434         Returns the file metadata as a :class:`~polib.POFile` instance.
 435         """
 436         e = POEntry(msgid='')
 437         mdata = self.ordered_metadata()
 438         if mdata:
 439             strs = []
 440             for name, value in mdata:
 441                 # Strip whitespace off each line in a multi-line entry
 442                 strs.append(f'{name}: {value}')
 443             e.msgstr = '\n'.join(strs) + '\n'
 444         if self.metadata_is_fuzzy:
 445             e.flags.append('fuzzy')
 446         return e
 447
 448     def save(self, fpath=None, repr_method='__unicode__', newline=None):
 449         """
 450         Saves the po file to ``fpath``.
 451         If it is an existing file and no ``fpath`` is provided, then the
 452         existing file is rewritten with the modified data.
 453
 454         Keyword arguments:
 455
 456         ``fpath``
 457             string, full or relative path to the file.
 458
 459         ``repr_method``
 460             string, the method to use for output.
 461
 462         ``newline``
 463             string, controls how universal newlines works
 464         """
 465         if self.fpath is None and fpath is None:
 466             raise OSError('You must provide a file path to save() method')
 467         contents = getattr(self, repr_method)()
 468         if fpath is None:
 469             fpath = self.fpath
 470         if repr_method == 'to_binary':
 471             fhandle = open(fpath, 'wb')
 472         else:
 473             fhandle = open(fpath, 'w', encoding=self.encoding, newline=newline)
 474             if not isinstance(contents, text_type):
 475                 contents = contents.decode(self.encoding)
 476         fhandle.write(contents)
 477         fhandle.close()
 478         # set the file path if not set
 479         if self.fpath is None and fpath:
 480             self.fpath = fpath
 481
 482     def find(self, st, by='msgid', include_obsolete_entries=False, msgctxt=False):
 483         """
 484         Find the entry which msgid (or property identified by the ``by``
 485         argument) matches the string ``st``.
 486
 487         Keyword arguments:
 488
 489         ``st``
 490             string, the string to search for.
 491
 492         ``by``
 493             string, the property to use for comparison (default: ``msgid``).
 494
 495         ``include_obsolete_entries``
 496             boolean, whether to also search in entries that are obsolete.
 497
 498         ``msgctxt``
 499             string, allows specifying a specific message context for the
 500             search.
 501         """
 502         if include_obsolete_entries:
 503             entries = self[:]
 504         else:
 505             entries = [e for e in self if not e.obsolete]
 506         matches = []
 507         for e in entries:
 508             if getattr(e, by) == st:
 509                 if msgctxt is not False and e.msgctxt != msgctxt:
 510                     continue
 511                 matches.append(e)
 512         if len(matches) == 1:
 513             return matches[0]
 514         elif len(matches) > 1:
 515             if not msgctxt:
 516                 # find the entry with no msgctx
 517                 e = None
 518                 for m in matches:
 519                     if not m.msgctxt:
 520                         e = m
 521                 if e:
 522                     return e
 523                 # fallback to the first entry found
 524                 return matches[0]
 525         return None
 526
 527     def ordered_metadata(self):
 528         """
 529         Convenience method that returns an ordered version of the metadata
 530         dictionary. The return value is list of tuples (metadata name,
 531         metadata_value).
 532         """
 533         # copy the dict first
 534         metadata = self.metadata.copy()
 535         data_order = [
 536             'Project-Id-Version',
 537             'Report-Msgid-Bugs-To',
 538             'POT-Creation-Date',
 539             'PO-Revision-Date',
 540             'Last-Translator',
 541             'Language-Team',
 542             'Language',
 543             'MIME-Version',
 544             'Content-Type',
 545             'Content-Transfer-Encoding',
 546             'Plural-Forms',
 547         ]
 548         ordered_data = []
 549         for data in data_order:
 550             try:
 551                 value = metadata.pop(data)
 552                 ordered_data.append((data, value))
 553             except KeyError:
 554                 pass
 555         # the rest of the metadata will be alphabetically ordered since there
 556         # are no specs for this AFAIK
 557         for data in natural_sort(metadata.keys()):
 558             value = metadata[data]
 559             ordered_data.append((data, value))
 560         return ordered_data
 561
 562     def to_binary(self):
 563         """
 564         Return the binary representation of the file.
 565         """
 566         offsets = []
 567         entries = self.translated_entries()
 568
 569         # the keys are sorted in the .mo file
 570         def cmp(_self, other):
 571             # msgfmt compares entries with msgctxt if it exists
 572             self_msgid = _self.msgctxt or _self.msgid
 573             other_msgid = other.msgctxt or other.msgid
 574             if self_msgid > other_msgid:
 575                 return 1
 576             elif self_msgid < other_msgid:
 577                 return -1
 578             else:
 579                 return 0
 580
 581         # add metadata entry
 582         entries.sort(key=lambda o: o.msgid_with_context.encode('utf-8'))
 583         mentry = self.metadata_as_entry()
 584         entries = [mentry] + entries
 585         entries_len = len(entries)
 586         ids, strs = b(''), b('')
 587         for e in entries:
 588             # For each string, we need size and file offset.  Each string is
 589             # NUL terminated; the NUL does not count into the size.
 590             msgid = b('')
 591             if e.msgctxt:
 592                 # Contexts are stored by storing the concatenation of the
 593                 # context, a <EOT> byte, and the original string
 594                 msgid = self._encode(e.msgctxt + '\4')
 595             if e.msgid_plural:
 596                 msgstr = []
 597                 for index in sorted(e.msgstr_plural.keys()):
 598                     msgstr.append(e.msgstr_plural[index])
 599                 msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
 600                 msgstr = self._encode('\0'.join(msgstr))
 601             else:
 602                 msgid += self._encode(e.msgid)
 603                 msgstr = self._encode(e.msgstr)
 604             offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
 605             ids += msgid + b('\0')
 606             strs += msgstr + b('\0')
 607
 608         # The header is 7 32-bit unsigned integers.
 609         keystart = 7 * 4 + 16 * entries_len
 610         # and the values start after the keys
 611         valuestart = keystart + len(ids)
 612         koffsets = []
 613         voffsets = []
 614         # The string table first has the list of keys, then the list of values.
 615         # Each entry has first the size of the string, then the file offset.
 616         for o1, l1, o2, l2 in offsets:
 617             koffsets += [l1, o1 + keystart]
 618             voffsets += [l2, o2 + valuestart]
 619         offsets = koffsets + voffsets
 620
 621         output = struct.pack(
 622             'Iiiiiii',
 623             # Magic number
 624             MOFile.MAGIC,
 625             # Version
 626             0,
 627             # number of entries
 628             entries_len,
 629             # start of key index
 630             7 * 4,
 631             # start of value index
 632             7 * 4 + entries_len * 8,
 633             # size and offset of hash table, we don't use hash tables
 634             0,
 635             keystart,
 636         )
 637         if PY3 and sys.version_info.minor > 1:  # python 3.2 or newer
 638             output += array.array('i', offsets).tobytes()
 639         else:
 640             output += array.array('i', offsets).tostring()
 641         output += ids
 642         output += strs
 643         return output
 644
 645     def _encode(self, mixed):
 646         """
 647         Encodes the given ``mixed`` argument with the file encoding if and
 648         only if it's an unicode string and returns the encoded string.
 649         """
 650         if isinstance(mixed, text_type):
 651             mixed = mixed.encode(self.encoding)
 652         return mixed
 653
 654
 655 # }}}
 656 # class POFile {{{
 657
 658
 659 class POFile(_BaseFile):
 660     """
 661     Po (or Pot) file reader/writer.
 662     This class inherits the :class:`~polib._BaseFile` class and, by extension,
 663     the python ``list`` type.
 664     """
 665
 666     def __unicode__(self):
 667         """
 668         Returns the unicode representation of the po file.
 669         """
 670         ret, headers = '', self.header.split('\n')
 671         for header in headers:
 672             if not header:
 673                 ret += '#\n'
 674             elif header[:1] in [',', ':']:
 675                 ret += '#%s\n' % header
 676             else:
 677                 ret += '# %s\n' % header
 678
 679         if not isinstance(ret, text_type):
 680             ret = ret.decode(self.encoding)
 681
 682         return ret + _BaseFile.__unicode__(self)
 683
 684     def save_as_mofile(self, fpath):
 685         """
 686         Saves the binary representation of the file to given ``fpath``.
 687
 688         Keyword argument:
 689
 690         ``fpath``
 691             string, full or relative path to the mo file.
 692         """
 693         _BaseFile.save(self, fpath, 'to_binary')
 694
 695     def percent_translated(self):
 696         """
 697         Convenience method that returns the percentage of translated
 698         messages.
 699         """
 700         total = len([e for e in self if not e.obsolete])
 701         if total == 0:
 702             return 100
 703         translated = len(self.translated_entries())
 704         return int(translated * 100 / float(total))
 705
 706     def translated_entries(self):
 707         """
 708         Convenience method that returns the list of translated entries.
 709         """
 710         return [e for e in self if e.translated()]
 711
 712     def untranslated_entries(self):
 713         """
 714         Convenience method that returns the list of untranslated entries.
 715         """
 716         return [
 717             e for e in self if not e.translated() and not e.obsolete and not e.fuzzy
 718         ]
 719
 720     def fuzzy_entries(self):
 721         """
 722         Convenience method that returns the list of fuzzy entries.
 723         """
 724         return [e for e in self if e.fuzzy and not e.obsolete]
 725
 726     def obsolete_entries(self):
 727         """
 728         Convenience method that returns the list of obsolete entries.
 729         """
 730         return [e for e in self if e.obsolete]
 731
 732     def merge(self, refpot):
 733         """
 734         Convenience method that merges the current pofile with the pot file
 735         provided. It behaves exactly as the gettext msgmerge utility:
 736
 737         * comments of this file will be preserved, but extracted comments and
 738           occurrences will be discarded;
 739         * any translations or comments in the file will be discarded, however,
 740           dot comments and file positions will be preserved;
 741         * the fuzzy flags are preserved.
 742
 743         Keyword argument:
 744
 745         ``refpot``
 746             object POFile, the reference catalog.
 747         """
 748         # Store entries in dict/set for faster access
 749         self_entries = {entry.msgid_with_context: entry for entry in self}
 750         refpot_msgids = {entry.msgid_with_context for entry in refpot}
 751         # Merge entries that are in the refpot
 752         for entry in refpot:
 753             e = self_entries.get(entry.msgid_with_context)
 754             if e is None:
 755                 e = POEntry()
 756                 self.append(e)
 757             e.merge(entry)
 758         # ok, now we must "obsolete" entries that are not in the refpot anymore
 759         for entry in self:
 760             if entry.msgid_with_context not in refpot_msgids:
 761                 entry.obsolete = True
 762
 763
 764 # }}}
 765 # class MOFile {{{
 766
 767
 768 class MOFile(_BaseFile):
 769     """
 770     Mo file reader/writer.
 771     This class inherits the :class:`~polib._BaseFile` class and, by
 772     extension, the python ``list`` type.
 773     """
 774
 775     MAGIC = 0x950412DE
 776     MAGIC_SWAPPED = 0xDE120495
 777
 778     def __init__(self, *args, **kwargs):
 779         """
 780         Constructor, accepts all keywords arguments accepted by
 781         :class:`~polib._BaseFile` class.
 782         """
 783         _BaseFile.__init__(self, *args, **kwargs)
 784         self.magic_number = None
 785         self.version = 0
 786
 787     def save_as_pofile(self, fpath):
 788         """
 789         Saves the mofile as a pofile to ``fpath``.
 790
 791         Keyword argument:
 792
 793         ``fpath``
 794             string, full or relative path to the file.
 795         """
 796         _BaseFile.save(self, fpath)
 797
 798     def save(self, fpath=None):
 799         """
 800         Saves the mofile to ``fpath``.
 801
 802         Keyword argument:
 803
 804         ``fpath``
 805             string, full or relative path to the file.
 806         """
 807         _BaseFile.save(self, fpath, 'to_binary')
 808
 809     def percent_translated(self):
 810         """
 811         Convenience method to keep the same interface with POFile instances.
 812         """
 813         return 100
 814
 815     def translated_entries(self):
 816         """
 817         Convenience method to keep the same interface with POFile instances.
 818         """
 819         return self
 820
 821     def untranslated_entries(self):
 822         """
 823         Convenience method to keep the same interface with POFile instances.
 824         """
 825         return []
 826
 827     def fuzzy_entries(self):
 828         """
 829         Convenience method to keep the same interface with POFile instances.
 830         """
 831         return []
 832
 833     def obsolete_entries(self):
 834         """
 835         Convenience method to keep the same interface with POFile instances.
 836         """
 837         return []
 838
 839
 840 # }}}
 841 # class _BaseEntry {{{
 842
 843
 844 class _BaseEntry:
 845     """
 846     Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
 847     This class should **not** be instantiated directly.
 848     """
 849
 850     def __init__(self, *_args, **kwargs):
 851         """
 852         Constructor, accepts the following keyword arguments:
 853
 854         ``msgid``
 855             string, the entry msgid.
 856
 857         ``msgstr``
 858             string, the entry msgstr.
 859
 860         ``msgid_plural``
 861             string, the entry msgid_plural.
 862
 863         ``msgstr_plural``
 864             dict, the entry msgstr_plural lines.
 865
 866         ``msgctxt``
 867             string, the entry context (msgctxt).
 868
 869         ``obsolete``
 870             bool, whether the entry is "obsolete" or not.
 871
 872         ``encoding``
 873             string, the encoding to use, defaults to ``default_encoding``
 874             global variable (optional).
 875         """
 876         self.msgid = kwargs.get('msgid', '')
 877         self.msgstr = kwargs.get('msgstr', '')
 878         self.msgid_plural = kwargs.get('msgid_plural', '')
 879         self.msgstr_plural = kwargs.get('msgstr_plural', {})
 880         self.msgctxt = kwargs.get('msgctxt', None)
 881         self.obsolete = kwargs.get('obsolete', False)
 882         self.encoding = kwargs.get('encoding', default_encoding)
 883
 884     def __unicode__(self, wrapwidth=78):
 885         """
 886         Returns the unicode representation of the entry.
 887         """
 888         if self.obsolete:
 889             delflag = '#~ '
 890         else:
 891             delflag = ''
 892         ret = []
 893         # write the msgctxt if any
 894         if self.msgctxt is not None:
 895             ret += self._str_field('msgctxt', delflag, '', self.msgctxt, wrapwidth)
 896         # write the msgid
 897         ret += self._str_field('msgid', delflag, '', self.msgid, wrapwidth)
 898         # write the msgid_plural if any
 899         if self.msgid_plural:
 900             ret += self._str_field(
 901                 'msgid_plural', delflag, '', self.msgid_plural, wrapwidth
 902             )
 903         if self.msgstr_plural:
 904             # write the msgstr_plural if any
 905             msgstrs = self.msgstr_plural
 906             keys = list(msgstrs)
 907             keys.sort()
 908             for index in keys:
 909                 msgstr = msgstrs[index]
 910                 plural_index = '[%s]' % index
 911                 ret += self._str_field(
 912                     'msgstr', delflag, plural_index, msgstr, wrapwidth
 913                 )
 914         else:
 915             # otherwise write the msgstr
 916             ret += self._str_field('msgstr', delflag, '', self.msgstr, wrapwidth)
 917         ret.append('')
 918         ret = u('\n').join(ret)
 919         return ret
 920
    # ``PY3`` is evaluated once, while the class body executes, to select the
    # ``__str__`` implementation.
    if PY3:

        def __str__(self):
            # same text as ``__unicode__()`` called with its default width
            return self.__unicode__()

    else:

        def __str__(self):
            """
            Returns the string representation of the entry.
            """
            return compat.ustr(self).encode(self.encoding)
 933
 934     def __eq__(self, other):
 935         return str(self) == str(other)
 936
 937     def __hash__(self):
 938         return hash(str(self))
 939
 940     def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78):
 941         lines = field.splitlines(True)
 942         if len(lines) > 1:
 943             lines = [''] + lines  # start with initial empty line
 944         else:
 945             escaped_field = escape(field)
 946             specialchars_count = 0
 947             for c in ['\\', '\n', '\r', '\t', '"']:
 948                 specialchars_count += field.count(c)
 949             # comparison must take into account fieldname length + one space
 950             # + 2 quotes (eg. msgid "<string>")
 951             flength = len(fieldname) + 3
 952             if plural_index:
 953                 flength += len(plural_index)
 954             real_wrapwidth = wrapwidth - flength + specialchars_count
 955             if wrapwidth > 0 and len(field) > real_wrapwidth:
 956                 # Wrap the line but take field name into account
 957                 lines = [''] + [
 958                     unescape(item)
 959                     for item in textwrap.wrap(
 960                         escaped_field,
 961                         wrapwidth - 2,  # 2 for quotes ""
 962                         drop_whitespace=False,
 963                         break_long_words=False,
 964                     )
 965                 ]
 966             else:
 967                 lines = [field]
 968         if fieldname.startswith('previous_'):
 969             # quick and dirty trick to get the real field name
 970             fieldname = fieldname[9:]
 971
 972         ret = [f'{delflag}{fieldname}{plural_index} "{escape(lines.pop(0))}"']
 973         for line in lines:
 974             ret.append(f'{delflag}"{escape(line)}"')
 975         return ret
 976
 977     @property
 978     def msgid_with_context(self):
 979         if self.msgctxt:
 980             return '{}{}{}'.format(self.msgctxt, '\x04', self.msgid)
 981         return self.msgid
 982
 983
 984 # }}}
 985 # class POEntry {{{
 986
 987
 988 class POEntry(_BaseEntry):
 989     """
 990     Represents a po file entry.
 991     """
 992
 993     def __init__(self, *args, **kwargs):
 994         """
 995         Constructor, accepts the following keyword arguments:
 996
 997         ``comment``
 998             string, the entry comment.
 999
1000         ``tcomment``
1001             string, the entry translator comment.
1002
1003         ``occurrences``
1004             list, the entry occurrences.
1005
1006         ``flags``
1007             list, the entry flags.
1008
1009         ``previous_msgctxt``
1010             string, the entry previous context.
1011
1012         ``previous_msgid``
1013             string, the entry previous msgid.
1014
1015         ``previous_msgid_plural``
1016             string, the entry previous msgid_plural.
1017
1018         ``linenum``
1019             integer, the line number of the entry
1020         """
1021         _BaseEntry.__init__(self, *args, **kwargs)
1022         self.comment = kwargs.get('comment', '')
1023         self.tcomment = kwargs.get('tcomment', '')
1024         self.occurrences = kwargs.get('occurrences', [])
1025         self.flags = kwargs.get('flags', [])
1026         self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
1027         self.previous_msgid = kwargs.get('previous_msgid', None)
1028         self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
1029         self.linenum = kwargs.get('linenum', None)
1030
1031     def __unicode__(self, wrapwidth=78):
1032         """
1033         Returns the unicode representation of the entry.
1034         """
1035         ret = []
1036         # comments first, if any (with text wrapping as xgettext does)
1037         if self.obsolete:
1038             comments = [('tcomment', '# ')]
1039         else:
1040             comments = [('comment', '#. '), ('tcomment', '# ')]
1041         for c in comments:
1042             val = getattr(self, c[0])
1043             if val:
1044                 for comment in val.split('\n'):
1045                     if len(comment) + len(c[1]) > wrapwidth > 0:
1046                         ret += textwrap.wrap(
1047                             comment,
1048                             wrapwidth,
1049                             initial_indent=c[1],
1050                             subsequent_indent=c[1],
1051                             break_long_words=False,
1052                         )
1053                     else:
1054                         ret.append(f'{c[1]}{comment}')
1055
1056         # occurrences (with text wrapping as xgettext does)
1057         if not self.obsolete and self.occurrences:
1058             filelist = []
1059             for fpath, lineno in self.occurrences:
1060                 if lineno:
1061                     filelist.append(f'{fpath}:{lineno}')
1062                 else:
1063                     filelist.append(fpath)
1064             filestr = ' '.join(filelist)
1065             if len(filestr) + 3 > wrapwidth > 0:
1066                 # textwrap split words that contain hyphen, this is not
1067                 # what we want for filenames, so the dirty hack is to
1068                 # temporally replace hyphens with a char that a file cannot
1069                 # contain, like "*"
1070                 ret += [
1071                     line.replace('*', '-')
1072                     for line in textwrap.wrap(
1073                         filestr.replace('-', '*'),
1074                         wrapwidth,
1075                         initial_indent='#: ',
1076                         subsequent_indent='#: ',
1077                         break_long_words=False,
1078                     )
1079                 ]
1080             else:
1081                 ret.append('#: ' + filestr)
1082
1083         # flags (TODO: wrapping ?)
1084         if self.flags:
1085             ret.append('#, %s' % ', '.join(self.flags))
1086
1087         # previous context and previous msgid/msgid_plural
1088         fields = ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural']
1089         if self.obsolete:
1090             prefix = '#~| '
1091         else:
1092             prefix = '#| '
1093         for f in fields:
1094             val = getattr(self, f)
1095             if val is not None:
1096                 ret += self._str_field(f, prefix, '', val, wrapwidth)
1097
1098         ret.append(_BaseEntry.__unicode__(self, wrapwidth))
1099         ret = u('\n').join(ret)
1100         return ret
1101
1102     def __cmp__(self, other):
1103         """
1104         Called by comparison operations if rich comparison is not defined.
1105         """
1106         # First: Obsolete test
1107         if self.obsolete != other.obsolete:
1108             if self.obsolete:
1109                 return -1
1110             else:
1111                 return 1
1112         # Work on a copy to protect original
1113         occ1 = sorted(self.occurrences[:])
1114         occ2 = sorted(other.occurrences[:])
1115         if occ1 > occ2:
1116             return 1
1117         if occ1 < occ2:
1118             return -1
1119         # Compare context
1120         msgctxt = self.msgctxt or '0'
1121         othermsgctxt = other.msgctxt or '0'
1122         if msgctxt > othermsgctxt:
1123             return 1
1124         elif msgctxt < othermsgctxt:
1125             return -1
1126         # Compare msgid_plural
1127         msgid_plural = self.msgid_plural or '0'
1128         othermsgid_plural = other.msgid_plural or '0'
1129         if msgid_plural > othermsgid_plural:
1130             return 1
1131         elif msgid_plural < othermsgid_plural:
1132             return -1
1133         # Compare msgstr_plural
1134         if self.msgstr_plural and isinstance(self.msgstr_plural, dict):
1135             msgstr_plural = list(self.msgstr_plural.values())
1136         else:
1137             msgstr_plural = []
1138         if other.msgstr_plural and isinstance(other.msgstr_plural, dict):
1139             othermsgstr_plural = list(other.msgstr_plural.values())
1140         else:
1141             othermsgstr_plural = []
1142         if msgstr_plural > othermsgstr_plural:
1143             return 1
1144         elif msgstr_plural < othermsgstr_plural:
1145             return -1
1146         # Compare msgid
1147         if self.msgid > other.msgid:
1148             return 1
1149         elif self.msgid < other.msgid:
1150             return -1
1151         # Compare msgstr
1152         if self.msgstr > other.msgstr:
1153             return 1
1154         elif self.msgstr < other.msgstr:
1155             return -1
1156         return 0
1157
1158     def __gt__(self, other):
1159         return self.__cmp__(other) > 0
1160
1161     def __lt__(self, other):
1162         return self.__cmp__(other) < 0
1163
1164     def __ge__(self, other):
1165         return self.__cmp__(other) >= 0
1166
1167     def __le__(self, other):
1168         return self.__cmp__(other) <= 0
1169
1170     def __eq__(self, other):
1171         return self.__cmp__(other) == 0
1172
1173     def __ne__(self, other):
1174         return self.__cmp__(other) != 0
1175
1176     def translated(self):
1177         """
1178         Returns ``True`` if the entry has been translated or ``False``
1179         otherwise.
1180         """
1181         if self.obsolete or self.fuzzy:
1182             return False
1183         if self.msgstr != '':
1184             return True
1185         if self.msgstr_plural:
1186             for pos in self.msgstr_plural:
1187                 if self.msgstr_plural[pos] == '':
1188                     return False
1189             return True
1190         return False
1191
1192     def merge(self, other):
1193         """
1194         Merge the current entry with the given pot entry.
1195         """
1196         self.msgid = other.msgid
1197         self.msgctxt = other.msgctxt
1198         self.occurrences = other.occurrences
1199         self.comment = other.comment
1200         fuzzy = self.fuzzy
1201         self.flags = other.flags[:]  # clone flags
1202         if fuzzy:
1203             self.flags.append('fuzzy')
1204         self.msgid_plural = other.msgid_plural
1205         self.obsolete = other.obsolete
1206         self.previous_msgctxt = other.previous_msgctxt
1207         self.previous_msgid = other.previous_msgid
1208         self.previous_msgid_plural = other.previous_msgid_plural
1209         if other.msgstr_plural:
1210             for pos in other.msgstr_plural:
1211                 try:
1212                     # keep existing translation at pos if any
1213                     self.msgstr_plural[pos]
1214                 except KeyError:
1215                     self.msgstr_plural[pos] = ''
1216
1217     @property
1218     def fuzzy(self):
1219         return 'fuzzy' in self.flags
1220
1221     def __hash__(self):
1222         return hash((self.msgid, self.msgstr))
1223
1224
1225 # }}}
1226 # class MOEntry {{{
1227
1228
class MOEntry(_BaseEntry):
    """
    A single entry of a mo file.
    """

    def __init__(self, *args, **kwargs):
        """
        Build the entry from the arguments of the base entry.

        For consistency with :class:`~polib.POEntry` the following keyword
        arguments may be given as well:

        ``comment``
        ``tcomment``
        ``occurrences``
        ``flags``
        ``previous_msgctxt``
        ``previous_msgid``
        ``previous_msgid_plural``

        Note: they carry no meaning for MO files, their values are ignored
        and the matching attributes are always reset to empty values.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # po-only attributes: present for API parity, always neutral
        self.previous_msgctxt = None
        self.previous_msgid = None
        self.previous_msgid_plural = None
        self.flags = []
        self.occurrences = []
        self.tcomment = ''
        self.comment = ''

    def __hash__(self):
        """Hash the (msgid, msgstr) pair of the entry."""
        identity = (self.msgid, self.msgstr)
        return hash(identity)
1262
1263
1264 # }}}
1265 # class _POFileParser {{{
1266
1267
1268 class _POFileParser:
1269     """
1270     A finite state machine to parse efficiently and correctly po
1271     file format.
1272     """
1273
1274     def __init__(self, pofile, *_args, **kwargs):
1275         """
1276         Constructor.
1277
1278         Keyword arguments:
1279
1280         ``pofile``
1281             string, path to the po file or its content
1282
1283         ``encoding``
1284             string, the encoding to use, defaults to ``default_encoding``
1285             global variable (optional).
1286
1287         ``check_for_duplicates``
1288             whether to check for duplicate entries when adding entries to the
1289             file (optional, default: ``False``).
1290         """
1291         enc = kwargs.get('encoding', default_encoding)
1292         if _is_file(pofile):
1293             try:
1294                 self.fhandle = open(pofile, encoding=enc)
1295             except LookupError:
1296                 enc = default_encoding
1297                 self.fhandle = open(pofile, encoding=enc)
1298         else:
1299             self.fhandle = pofile.splitlines()
1300
1301         klass = kwargs.get('klass')
1302         if klass is None:
1303             klass = POFile
1304         self.instance = klass(
1305             pofile=pofile,
1306             encoding=enc,
1307             check_for_duplicates=kwargs.get('check_for_duplicates', False),
1308         )
1309         self.transitions = {}
1310         self.current_line = 0
1311         self.current_entry = POEntry(linenum=self.current_line)
1312         self.current_state = 'st'
1313         self.current_token = None
1314         # two memo flags used in handlers
1315         self.msgstr_index = 0
1316         self.entry_obsolete = 0
1317         # Configure the state machine, by adding transitions.
1318         # Signification of symbols:
1319         #     * ST: Beginning of the file (start)
1320         #     * HE: Header
1321         #     * TC: a translation comment
1322         #     * GC: a generated comment
1323         #     * OC: a file/line occurrence
1324         #     * FL: a flags line
1325         #     * CT: a message context
1326         #     * PC: a previous msgctxt
1327         #     * PM: a previous msgid
1328         #     * PP: a previous msgid_plural
1329         #     * MI: a msgid
1330         #     * MP: a msgid plural
1331         #     * MS: a msgstr
1332         #     * MX: a msgstr plural
1333         #     * MC: a msgid or msgstr continuation line
1334         all = [
1335             'st',
1336             'he',
1337             'gc',
1338             'oc',
1339             'fl',
1340             'ct',
1341             'pc',
1342             'pm',
1343             'pp',
1344             'tc',
1345             'ms',
1346             'mp',
1347             'mx',
1348             'mi',
1349         ]
1350
1351         self.add('tc', ['st', 'he'], 'he')
1352         self.add(
1353             'tc',
1354             ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mp', 'mx', 'mi'],
1355             'tc',
1356         )
1357         self.add('gc', all, 'gc')
1358         self.add('oc', all, 'oc')
1359         self.add('fl', all, 'fl')
1360         self.add('pc', all, 'pc')
1361         self.add('pm', all, 'pm')
1362         self.add('pp', all, 'pp')
1363         self.add(
1364             'ct',
1365             ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
1366             'ct',
1367         )
1368         self.add(
1369             'mi',
1370             ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
1371             'mi',
1372         )
1373         self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
1374         self.add('ms', ['mi', 'mp', 'tc'], 'ms')
1375         self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
1376         self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')
1377
1378     def parse(self):
1379         """
1380         Run the state machine, parse the file line by line and call process()
1381         with the current matched symbol.
1382         """
1383
1384         keywords = {
1385             'msgctxt': 'ct',
1386             'msgid': 'mi',
1387             'msgstr': 'ms',
1388             'msgid_plural': 'mp',
1389         }
1390         prev_keywords = {
1391             'msgid_plural': 'pp',
1392             'msgid': 'pm',
1393             'msgctxt': 'pc',
1394         }
1395         tokens = []
1396         fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
1397         for line in self.fhandle:
1398             self.current_line += 1
1399             if self.current_line == 1:
1400                 BOM = codecs.BOM_UTF8.decode('utf-8')
1401                 if line.startswith(BOM):
1402                     line = line[len(BOM) :]
1403             line = line.strip()
1404             if line == '':
1405                 continue
1406
1407             tokens = line.split(None, 2)
1408             nb_tokens = len(tokens)
1409
1410             if tokens[0] == '#~|':
1411                 continue
1412
1413             if tokens[0] == '#~' and nb_tokens > 1:
1414                 line = line[3:].strip()
1415                 tokens = tokens[1:]
1416                 nb_tokens -= 1
1417                 self.entry_obsolete = 1
1418             else:
1419                 self.entry_obsolete = 0
1420
1421             # Take care of keywords like
1422             # msgid, msgid_plural, msgctxt & msgstr.
1423             if tokens[0] in keywords and nb_tokens > 1:
1424                 line = line[len(tokens[0]) :].lstrip()
1425                 if re.search(r'([^\\]|^)"', line[1:-1]):
1426                     raise OSError(
1427                         'Syntax error in po file %s(line %s): '
1428                         'unescaped double quote found' % (fpath, self.current_line)
1429                     )
1430                 self.current_token = line
1431                 self.process(keywords[tokens[0]])
1432                 continue
1433
1434             self.current_token = line
1435
1436             if tokens[0] == '#:':
1437                 if nb_tokens <= 1:
1438                     continue
1439                 # we are on a occurrences line
1440                 self.process('oc')
1441
1442             elif line[:1] == '"':
1443                 # we are on a continuation line
1444                 if re.search(r'([^\\]|^)"', line[1:-1]):
1445                     raise OSError(
1446                         'Syntax error in po file %s(line %s): '
1447                         'unescaped double quote found' % (fpath, self.current_line)
1448                     )
1449                 self.process('mc')
1450
1451             elif line[:7] == 'msgstr[':
1452                 # we are on a msgstr plural
1453                 self.process('mx')
1454
1455             elif tokens[0] == '#,':
1456                 if nb_tokens <= 1:
1457                     continue
1458                 # we are on a flags line
1459                 self.process('fl')
1460
1461             elif tokens[0] == '#' or tokens[0].startswith('##'):
1462                 if line == '#':
1463                     line += ' '
1464                 # we are on a translator comment line
1465                 self.process('tc')
1466
1467             elif tokens[0] == '#.':
1468                 if nb_tokens <= 1:
1469                     continue
1470                 # we are on a generated comment line
1471                 self.process('gc')
1472
1473             elif tokens[0] == '#|':
1474                 if nb_tokens <= 1:
1475                     raise OSError(
1476                         'Syntax error in po file %s(line %s)'
1477                         % (fpath, self.current_line)
1478                     )
1479
1480                 # Remove the marker and any whitespace right after that.
1481                 line = line[2:].lstrip()
1482                 self.current_token = line
1483
1484                 if tokens[1].startswith('"'):
1485                     # Continuation of previous metadata.
1486                     self.process('mc')
1487                     continue
1488
1489                 if nb_tokens == 2:
1490                     # Invalid continuation line.
1491                     raise OSError(
1492                         'Syntax error in po file %s(line %s): '
1493                         'invalid continuation line' % (fpath, self.current_line)
1494                     )
1495
1496                 # we are on a "previous translation" comment line,
1497                 if tokens[1] not in prev_keywords:
1498                     # Unknown keyword in previous translation comment.
1499                     raise OSError(
1500                         'Syntax error in po file %s(line %s): '
1501                         'unknown keyword %s' % (fpath, self.current_line, tokens[1])
1502                     )
1503
1504                 # Remove the keyword and any whitespace
1505                 # between it and the starting quote.
1506                 line = line[len(tokens[1]) :].lstrip()
1507                 self.current_token = line
1508                 self.process(prev_keywords[tokens[1]])
1509
1510             else:
1511                 raise OSError(
1512                     f'Syntax error in po file {fpath}(line {self.current_line})'
1513                 )
1514
1515         if self.current_entry and len(tokens) > 0 and not tokens[0].startswith('#'):
1516             # since entries are added when another entry is found, we must add
1517             # the last entry here (only if there are lines). Trailing comments
1518             # are ignored
1519             self.instance.append(self.current_entry)
1520
1521         # before returning the instance, check if there's metadata and if
1522         # so extract it in a dict
1523         metadataentry = self.instance.find('')
1524         if metadataentry:  # metadata found
1525             # remove the entry
1526             self.instance.remove(metadataentry)
1527             self.instance.metadata_is_fuzzy = metadataentry.flags
1528             key = None
1529             for msg in metadataentry.msgstr.splitlines():
1530                 try:
1531                     key, val = msg.split(':', 1)
1532                     self.instance.metadata[key] = val.strip()
1533                 except (ValueError, KeyError):
1534                     if key is not None:
1535                         self.instance.metadata[key] += '\n' + msg.strip()
1536         # close opened file
1537         if not isinstance(self.fhandle, list):  # must be file
1538             self.fhandle.close()
1539         return self.instance
1540
1541     def add(self, symbol, states, next_state):
1542         """
1543         Add a transition to the state machine.
1544
1545         Keywords arguments:
1546
1547         ``symbol``
1548             string, the matched token (two chars symbol).
1549
1550         ``states``
1551             list, a list of states (two chars symbols).
1552
1553         ``next_state``
1554             the next state the fsm will have after the action.
1555         """
1556         for state in states:
1557             action = getattr(self, 'handle_%s' % next_state)
1558             self.transitions[(symbol, state)] = (action, next_state)
1559
1560     def process(self, symbol):
1561         """
1562         Process the transition corresponding to the current state and the
1563         symbol provided.
1564
1565         Keywords arguments:
1566
1567         ``symbol``
1568             string, the matched token (two chars symbol).
1569
1570         ``linenum``
1571             integer, the current line number of the parsed file.
1572         """
1573         try:
1574             (action, state) = self.transitions[(symbol, self.current_state)]
1575             if action():
1576                 self.current_state = state
1577         except Exception:
1578             fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
1579             if hasattr(self.fhandle, 'close'):
1580                 self.fhandle.close()
1581             raise OSError(f'Syntax error in po file {fpath}(line {self.current_line})')
1582
1583     # state handlers
1584
1585     def handle_he(self):
1586         """Handle a header comment."""
1587         if self.instance.header != '':
1588             self.instance.header += '\n'
1589         self.instance.header += self.current_token[2:]
1590         return 1
1591
1592     def handle_tc(self):
1593         """Handle a translator comment."""
1594         if self.current_state in ['mc', 'ms', 'mx']:
1595             self.instance.append(self.current_entry)
1596             self.current_entry = POEntry(linenum=self.current_line)
1597         if self.current_entry.tcomment != '':
1598             self.current_entry.tcomment += '\n'
1599         tcomment = self.current_token.lstrip('#')
1600         if tcomment.startswith(' '):
1601             tcomment = tcomment[1:]
1602         self.current_entry.tcomment += tcomment
1603         return True
1604
1605     def handle_gc(self):
1606         """Handle a generated comment."""
1607         if self.current_state in ['mc', 'ms', 'mx']:
1608             self.instance.append(self.current_entry)
1609             self.current_entry = POEntry(linenum=self.current_line)
1610         if self.current_entry.comment != '':
1611             self.current_entry.comment += '\n'
1612         self.current_entry.comment += self.current_token[3:]
1613         return True
1614
1615     def handle_oc(self):
1616         """Handle a file:num occurrence."""
1617         if self.current_state in ['mc', 'ms', 'mx']:
1618             self.instance.append(self.current_entry)
1619             self.current_entry = POEntry(linenum=self.current_line)
1620         occurrences = self.current_token[3:].split()
1621         for occurrence in occurrences:
1622             if occurrence != '':
1623                 try:
1624                     fil, line = occurrence.rsplit(':', 1)
1625                     if not line.isdigit():
1626                         fil = occurrence
1627                         line = ''
1628                     self.current_entry.occurrences.append((fil, line))
1629                 except (ValueError, AttributeError):
1630                     self.current_entry.occurrences.append((occurrence, ''))
1631         return True
1632
1633     def handle_fl(self):
1634         """Handle a flags line."""
1635         if self.current_state in ['mc', 'ms', 'mx']:
1636             self.instance.append(self.current_entry)
1637             self.current_entry = POEntry(linenum=self.current_line)
1638         self.current_entry.flags += [
1639             c.strip() for c in self.current_token[3:].split(',')
1640         ]
1641         return True
1642
1643     def handle_pp(self):
1644         """Handle a previous msgid_plural line."""
1645         if self.current_state in ['mc', 'ms', 'mx']:
1646             self.instance.append(self.current_entry)
1647             self.current_entry = POEntry(linenum=self.current_line)
1648         self.current_entry.previous_msgid_plural = unescape(self.current_token[1:-1])
1649         return True
1650
1651     def handle_pm(self):
1652         """Handle a previous msgid line."""
1653         if self.current_state in ['mc', 'ms', 'mx']:
1654             self.instance.append(self.current_entry)
1655             self.current_entry = POEntry(linenum=self.current_line)
1656         self.current_entry.previous_msgid = unescape(self.current_token[1:-1])
1657         return True
1658
1659     def handle_pc(self):
1660         """Handle a previous msgctxt line."""
1661         if self.current_state in ['mc', 'ms', 'mx']:
1662             self.instance.append(self.current_entry)
1663             self.current_entry = POEntry(linenum=self.current_line)
1664         self.current_entry.previous_msgctxt = unescape(self.current_token[1:-1])
1665         return True
1666
1667     def handle_ct(self):
1668         """Handle a msgctxt."""
1669         if self.current_state in ['mc', 'ms', 'mx']:
1670             self.instance.append(self.current_entry)
1671             self.current_entry = POEntry(linenum=self.current_line)
1672         self.current_entry.msgctxt = unescape(self.current_token[1:-1])
1673         return True
1674
1675     def handle_mi(self):
1676         """Handle a msgid."""
1677         if self.current_state in ['mc', 'ms', 'mx']:
1678             self.instance.append(self.current_entry)
1679             self.current_entry = POEntry(linenum=self.current_line)
1680         self.current_entry.obsolete = self.entry_obsolete
1681         self.current_entry.msgid = unescape(self.current_token[1:-1])
1682         return True
1683
1684     def handle_mp(self):
1685         """Handle a msgid plural."""
1686         self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
1687         return True
1688
1689     def handle_ms(self):
1690         """Handle a msgstr."""
1691         self.current_entry.msgstr = unescape(self.current_token[1:-1])
1692         return True
1693
1694     def handle_mx(self):
1695         """Handle a msgstr plural."""
1696         index = self.current_token[7]
1697         value = self.current_token[self.current_token.find('"') + 1 : -1]
1698         self.current_entry.msgstr_plural[int(index)] = unescape(value)
1699         self.msgstr_index = int(index)
1700         return True
1701
1702     def handle_mc(self):
1703         """Handle a msgid or msgstr continuation line."""
1704         token = unescape(self.current_token[1:-1])
1705         if self.current_state == 'ct':
1706             self.current_entry.msgctxt += token
1707         elif self.current_state == 'mi':
1708             self.current_entry.msgid += token
1709         elif self.current_state == 'mp':
1710             self.current_entry.msgid_plural += token
1711         elif self.current_state == 'ms':
1712             self.current_entry.msgstr += token
1713         elif self.current_state == 'mx':
1714             self.current_entry.msgstr_plural[self.msgstr_index] += token
1715         elif self.current_state == 'pp':
1716             self.current_entry.previous_msgid_plural += token
1717         elif self.current_state == 'pm':
1718             self.current_entry.previous_msgid += token
1719         elif self.current_state == 'pc':
1720             self.current_entry.previous_msgctxt += token
1721         # don't change the current state
1722         return False
1723
1724
1725 # }}}
1726 # class _MOFileParser {{{
1727
1728
class _MOFileParser:
    """
    A class to parse binary mo files.

    The parser reads the mo header, the table of original strings and the
    table of translated strings, then appends one ``MOEntry`` per string
    pair to the instance built in the constructor.
    """

    def __init__(self, mofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``mofile``
            string, path to the mo file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class used to build the parsed instance (optional, default:
            ``MOFile``).
        """
        if _is_file(mofile):
            self.fhandle = open(mofile, 'rb')
        else:
            self.fhandle = io.BytesIO(mofile)

        klass = kwargs.get('klass')
        if klass is None:
            klass = MOFile
        self.instance = klass(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False),
        )

    def __del__(self):
        """
        Make sure the file is closed, this prevents warnings on unclosed file
        when running tests with python >= 3.2.
        """
        # ``fhandle`` is missing when the constructor failed before assigning
        # it (e.g. ``open`` raised), and finalizers still run in that case.
        fhandle = getattr(self, 'fhandle', None)
        if fhandle and hasattr(fhandle, 'close'):
            fhandle.close()

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Returns the instance created in the constructor, filled with the
        metadata and entries read from the file.

        Raises ``OSError`` when the magic number or the major revision
        number is not a supported one.  The file handle is closed whether
        parsing succeeds or fails.
        """
        try:
            self._parse_content()
        finally:
            # close opened file, even when its content turned out invalid
            self.fhandle.close()
        return self.instance

    def _parse_content(self):
        """
        Read the header, both string tables and every entry from the file
        handle into ``self.instance``.
        """
        # parse magic number; it also tells the byte order of the file
        magic_number = self._readbinary('<I', 4)
        if magic_number == MOFile.MAGIC:
            ii = '<II'
        elif magic_number == MOFile.MAGIC_SWAPPED:
            ii = '>II'
        else:
            raise OSError('Invalid mo file, magic number is incorrect !')
        self.instance.magic_number = magic_number
        # parse the version number and the number of strings
        version, numofstrings = self._readbinary(ii, 8)
        # from MO file format specs: "A program seeing an unexpected major
        # revision number should stop reading the MO file entirely"
        if version >> 16 not in (0, 1):
            raise OSError('Invalid mo file, unexpected major revision number')
        self.instance.version = version
        # offsets of the original strings and translation strings tables
        msgids_table_offset, msgstrs_table_offset = self._readbinary(ii, 8)
        # move to msgid table and read length and offset of msgids
        self.fhandle.seek(msgids_table_offset)
        msgids_index = [self._readbinary(ii, 8) for _ in range(numofstrings)]
        # move to msgstr table and read length and offset of msgstrs
        self.fhandle.seek(msgstrs_table_offset)
        msgstrs_index = [self._readbinary(ii, 8) for _ in range(numofstrings)]
        # build entries
        index = zip(msgids_index, msgstrs_index)
        for i, ((id_len, id_offset), (str_len, str_offset)) in enumerate(index):
            self.fhandle.seek(id_offset)
            msgid = self.fhandle.read(id_len)

            self.fhandle.seek(str_offset)
            msgstr = self.fhandle.read(str_len)
            if i == 0 and not msgid:  # metadata
                self.instance.metadata = self._parse_metadata(msgstr)
                continue
            # test if we have a plural entry
            msgid_tokens = msgid.split(b'\0')
            if len(msgid_tokens) > 1:
                entry = self._build_entry(
                    msgid=msgid_tokens[0],
                    msgid_plural=msgid_tokens[1],
                    msgstr_plural=dict(enumerate(msgstr.split(b'\0'))),
                )
            else:
                entry = self._build_entry(msgid=msgid, msgstr=msgstr)
            self.instance.append(entry)

    def _parse_metadata(self, msgstr):
        """
        Turn the raw msgstr of the header entry into a dict of decoded
        ``key: value`` pairs.

        Lines are split on the first colon; a line without colon maps its
        key to an empty string and a line with an empty key is ignored.
        """
        encoding = self.instance.encoding
        metadata = {}
        for line in msgstr.split(b'\n'):
            key, _sep, value = line.partition(b':')
            if key:
                metadata[key.decode(encoding)] = value.decode(encoding).strip()
        return metadata

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None, msgstr_plural=None):
        """
        Build a ``MOEntry`` from raw (bytes) values.

        ``msgid`` may carry a context, stored as ``msgctxt + EOT + msgid``.
        Empty or missing ``msgstr``, ``msgid_plural`` and ``msgstr_plural``
        are not passed to ``MOEntry``.  ``msgstr_plural`` is a dict mapping
        plural indexes to raw strings; it is not modified.
        """
        encoding = self.instance.encoding
        # split on the first EOT only so that nothing following a second
        # separator is silently dropped from the msgid
        msgctxt, separator, ctxt_msgid = msgid.partition(b'\x04')
        if separator:
            kwargs = {
                'msgctxt': msgctxt.decode(encoding),
                'msgid': ctxt_msgid.decode(encoding),
            }
        else:
            kwargs = {'msgid': msgid.decode(encoding)}
        if msgstr:
            kwargs['msgstr'] = msgstr.decode(encoding)
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
        if msgstr_plural:
            kwargs['msgstr_plural'] = {
                index: value.decode(encoding)
                for index, value in msgstr_plural.items()
            }
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpack n bytes of data using format <fmt>.
        It returns a tuple or a mixed value if the tuple length is 1.

        ``struct.error`` is raised when the bytes read from the file handle
        do not match the size required by ``fmt`` (e.g. truncated file).
        """
        content = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, content)
        if len(tup) == 1:
            return tup[0]
        return tup
1872
1873
1874 # }}}