cola/polib.py

   1 # -* coding: utf-8 -*-
   2 #
   3 # License: MIT (see extras/polib/LICENSE file provided)
   4 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
   5
   6 """
   7 **polib** allows you to manipulate, create, modify gettext files (pot, po and
   8 mo files).  You can load existing files, iterate through its entries, add,
   9 modify entries, comments or metadata, etc. or create new po files from scratch.
  10
  11 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
  12 :func:`~polib.mofile` convenience functions.
  13 """
  14 from __future__ import absolute_import, division, print_function
  15 import array
  16 import codecs
  17 import os
  18 import re
  19 import struct
  20 import sys
  21 import textwrap
  22 import io
  23
  24 from . import compat
  25
  26
  27 __author__ = 'David Jean Louis <izimobil@gmail.com>'
  28 __version__ = '1.1.1'
  29 __all__ = [
  30     'pofile',
  31     'POFile',
  32     'POEntry',
  33     'mofile',
  34     'MOFile',
  35     'MOEntry',
  36     'default_encoding',
  37     'escape',
  38     'unescape',
  39     'detect_encoding',
  40 ]
  41
  42
  43 # the default encoding to use when encoding cannot be detected
  44 default_encoding = 'utf-8'
  45
  46 # python 2/3 compatibility helpers {{{
  47
  48
  49 if sys.version_info < (3,):
  50     PY3 = False
  51     text_type = compat.ustr
  52
  53     def b(s):
  54         return s
  55
  56     def u(s):
  57         return compat.ustr(s, "unicode_escape")
  58
  59
  60 else:
  61     PY3 = True
  62     text_type = str
  63
  64     def b(s):
  65         return s.encode("utf-8")
  66
  67     def u(s):
  68         return s
  69
  70
  71 # }}}
  72 # _pofile_or_mofile {{{
  73
  74
  75 def _pofile_or_mofile(f, filetype, **kwargs):
  76     """
  77     Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
  78     honor the DRY concept.
  79     """
  80     # get the file encoding
  81     enc = kwargs.get('encoding')
  82     if enc is None:
  83         enc = detect_encoding(f, filetype == 'mofile')
  84
  85     # parse the file
  86     kls = _POFileParser if filetype == 'pofile' else _MOFileParser
  87     parser = kls(
  88         f,
  89         encoding=enc,
  90         check_for_duplicates=kwargs.get('check_for_duplicates', False),
  91         klass=kwargs.get('klass'),
  92     )
  93     instance = parser.parse()
  94     instance.wrapwidth = kwargs.get('wrapwidth', 78)
  95     return instance
  96
  97
  98 # }}}
  99 # _is_file {{{
 100
 101
 102 def _is_file(filename_or_contents):
 103     """
 104     Safely returns the value of os.path.exists(filename_or_contents).
 105
 106     Arguments:
 107
 108     ``filename_or_contents``
 109         either a filename, or a string holding the contents of some file.
 110         In the latter case, this function will always return False.
 111     """
 112     try:
 113         return os.path.isfile(filename_or_contents)
 114     except (TypeError, ValueError, UnicodeEncodeError):
 115         return False
 116
 117
 118 # }}}
 119 # function pofile() {{{
 120
 121
 122 # pylint: disable=redefined-outer-name
 123 def pofile(pofile, **kwargs):
 124     """
 125     Convenience function that parses the po or pot file ``pofile`` and returns
 126     a :class:`~polib.POFile` instance.
 127
 128     Arguments:
 129
 130     ``pofile``
 131         string, full or relative path to the po/pot file or its content (data).
 132
 133     ``wrapwidth``
 134         integer, the wrap width, only useful when the ``-w`` option was passed
 135         to xgettext (optional, default: ``78``).
 136
 137     ``encoding``
 138         string, the encoding to use (e.g. "utf-8") (default: ``None``, the
 139         encoding will be auto-detected).
 140
 141     ``check_for_duplicates``
 142         whether to check for duplicate entries when adding entries to the
 143         file (optional, default: ``False``).
 144
 145     ``klass``
 146         class which is used to instantiate the return value (optional,
 147         default: ``None``, the return value with be a :class:`~polib.POFile`
 148         instance).
 149     """
 150     return _pofile_or_mofile(pofile, 'pofile', **kwargs)
 151
 152
 153 # }}}
 154 # function mofile() {{{
 155
 156
 157 # pylint: disable=redefined-outer-name
 158 def mofile(mofile, **kwargs):
 159     """
 160     Convenience function that parses the mo file ``mofile`` and returns a
 161     :class:`~polib.MOFile` instance.
 162
 163     Arguments:
 164
 165     ``mofile``
 166         string, full or relative path to the mo file or its content (string
 167         or bytes).
 168
 169     ``wrapwidth``
 170         integer, the wrap width, only useful when the ``-w`` option was passed
 171         to xgettext to generate the po file that was used to format the mo file
 172         (optional, default: ``78``).
 173
 174     ``encoding``
 175         string, the encoding to use (e.g. "utf-8") (default: ``None``, the
 176         encoding will be auto-detected).
 177
 178     ``check_for_duplicates``
 179         whether to check for duplicate entries when adding entries to the
 180         file (optional, default: ``False``).
 181
 182     ``klass``
 183         class which is used to instantiate the return value (optional,
 184         default: ``None``, the return value with be a :class:`~polib.POFile`
 185         instance).
 186     """
 187     return _pofile_or_mofile(mofile, 'mofile', **kwargs)
 188
 189
 190 # }}}
 191 # function detect_encoding() {{{
 192
 193
 194 def detect_encoding(file, binary_mode=False):
 195     """
 196     Try to detect the encoding used by the ``file``. The ``file`` argument can
 197     be a PO or MO file path or a string containing the contents of the file.
 198     If the encoding cannot be detected, the function will return the value of
 199     ``default_encoding``.
 200
 201     Arguments:
 202
 203     ``file``
 204         string, full or relative path to the po/mo file or its content.
 205
 206     ``binary_mode``
 207         boolean, set this to True if ``file`` is a mo file.
 208     """
 209     PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
 210     rxt = re.compile(u(PATTERN))
 211     rxb = re.compile(b(PATTERN))
 212
 213     def charset_exists(charset):
 214         """Check whether ``charset`` is valid or not."""
 215         try:
 216             codecs.lookup(charset)
 217         except LookupError:
 218             return False
 219         return True
 220
 221     if not _is_file(file):
 222         try:
 223             match = rxt.search(file)
 224         except TypeError:
 225             match = rxb.search(file)
 226         if match:
 227             enc = match.group(1).strip()
 228             if not isinstance(enc, text_type):
 229                 enc = enc.decode('utf-8')
 230             if charset_exists(enc):
 231                 return enc
 232     else:
 233         # For PY3, always treat as binary
 234         if binary_mode or PY3:
 235             mode = 'rb'
 236             rx = rxb
 237         else:
 238             mode = 'r'
 239             rx = rxt
 240         f = open(file, mode)
 241         for line in f.readlines():
 242             match = rx.search(line)
 243             if match:
 244                 f.close()
 245                 enc = match.group(1).strip()
 246                 if not isinstance(enc, text_type):
 247                     enc = enc.decode('utf-8')
 248                 if charset_exists(enc):
 249                     return enc
 250         f.close()
 251     return default_encoding
 252
 253
 254 # }}}
 255 # function escape() {{{
 256
 257
 258 def escape(st):
 259     """
 260     Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 261     the given string ``st`` and returns it.
 262     """
 263     return (
 264         st.replace('\\', r'\\')
 265         .replace('\t', r'\t')
 266         .replace('\r', r'\r')
 267         .replace('\n', r'\n')
 268         .replace('\"', r'\"')
 269     )
 270
 271
 272 # }}}
 273 # function unescape() {{{
 274
 275
 276 def unescape(st):
 277     """
 278     Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 279     the given string ``st`` and returns it.
 280     """
 281
 282     def unescape_repl(m):
 283         m = m.group(1)
 284         if m == 'n':
 285             return '\n'
 286         if m == 't':
 287             return '\t'
 288         if m == 'r':
 289             return '\r'
 290         if m == '\\':
 291             return '\\'
 292         return m  # handles escaped double quote
 293
 294     return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
 295
 296
 297 # }}}
 298 # function natural_sort() {{{
 299
 300
 301 def natural_sort(lst):
 302     """
 303     Sort naturally the given list.
 304     Credits: http://stackoverflow.com/a/4836734
 305     """
 306
 307     def convert(text):
 308         return int(text) if text.isdigit() else text.lower()
 309
 310     def alphanum_key(key):
 311         return [convert(c) for c in re.split('([0-9]+)', key)]
 312
 313     return sorted(lst, key=alphanum_key)
 314
 315
 316 # }}}
 317 # class _BaseFile {{{
 318
 319
 320 class _BaseFile(list):
 321     """
 322     Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
 323     classes. This class should **not** be instantiated directly.
 324     """
 325
 326     def __init__(self, *_args, **kwargs):
 327         """
 328         Constructor, accepts the following keyword arguments:
 329
 330         ``pofile``
 331             string, the path to the po or mo file, or its content as a string.
 332
 333         ``wrapwidth``
 334             integer, the wrap width, only useful when the ``-w`` option was
 335             passed to xgettext (optional, default: ``78``).
 336
 337         ``encoding``
 338             string, the encoding to use, defaults to ``default_encoding``
 339             global variable (optional).
 340
 341         ``check_for_duplicates``
 342             whether to check for duplicate entries when adding entries to the
 343             file, (optional, default: ``False``).
 344         """
 345         list.__init__(self)
 346         # the opened file handle
 347         pofile = kwargs.get('pofile', None)  # pylint: disable=redefined-outer-name
 348         if pofile and _is_file(pofile):
 349             self.fpath = pofile
 350         else:
 351             self.fpath = kwargs.get('fpath')
 352         # the width at which lines should be wrapped
 353         self.wrapwidth = kwargs.get('wrapwidth', 78)
 354         # the file encoding
 355         self.encoding = kwargs.get('encoding', default_encoding)
 356         # whether to check for duplicate entries or not
 357         self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
 358         # header
 359         self.header = ''
 360         # both po and mo files have metadata
 361         self.metadata = {}
 362         self.metadata_is_fuzzy = 0
 363
 364     def __unicode__(self):
 365         """
 366         Returns the unicode representation of the file.
 367         """
 368         ret = []
 369         entries = [self.metadata_as_entry()] + [e for e in self if not e.obsolete]
 370         for entry in entries:
 371             ret.append(entry.__unicode__(self.wrapwidth))
 372         for entry in self.obsolete_entries():  # pylint: disable=no-member
 373             ret.append(entry.__unicode__(self.wrapwidth))
 374         ret = u('\n').join(ret)
 375         return ret
 376
 377     if PY3:
 378
 379         def __str__(self):
 380             return self.__unicode__()
 381
 382     else:
 383
 384         def __str__(self):
 385             """
 386             Returns the string representation of the file.
 387             """
 388             return compat.ustr(self).encode(self.encoding)
 389
 390     def __contains__(self, entry):
 391         """
 392         Overridden ``list`` method to implement the membership test (in and
 393         not in).
 394         The method considers that an entry is in the file if it finds an entry
 395         that has the same msgid (the test is **case sensitive**) and the same
 396         msgctxt (or none for both entries).
 397
 398         Argument:
 399
 400         ``entry``
 401             an instance of :class:`~polib._BaseEntry`.
 402         """
 403         return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) is not None
 404
 405     def __eq__(self, other):
 406         return str(self) == str(other)
 407
 408     def __hash__(self):
 409         return hash(str(self))
 410
 411     def append(self, entry):
 412         """
 413         Overridden method to check for duplicates entries, if a user tries to
 414         add an entry that is already in the file, the method will raise a
 415         ``ValueError`` exception.
 416
 417         Argument:
 418
 419         ``entry``
 420             an instance of :class:`~polib._BaseEntry`.
 421         """
 422         # check_for_duplicates may not be defined (yet) when unpickling.
 423         # But if pickling, we never want to check for duplicates anyway.
 424         if getattr(self, 'check_for_duplicates', False) and entry in self:
 425             raise ValueError('Entry "%s" already exists' % entry.msgid)
 426         super(_BaseFile, self).append(entry)
 427
 428     def insert(self, index, entry):
 429         """
 430         Overridden method to check for duplicates entries, if a user tries to
 431         add an entry that is already in the file, the method will raise a
 432         ``ValueError`` exception.
 433
 434         Arguments:
 435
 436         ``index``
 437             index at which the entry should be inserted.
 438
 439         ``entry``
 440             an instance of :class:`~polib._BaseEntry`.
 441         """
 442         if self.check_for_duplicates and entry in self:
 443             raise ValueError('Entry "%s" already exists' % entry.msgid)
 444         super(_BaseFile, self).insert(index, entry)
 445
 446     def metadata_as_entry(self):
 447         """
 448         Returns the file metadata as a :class:`~polib.POFile` instance.
 449         """
 450         e = POEntry(msgid='')
 451         mdata = self.ordered_metadata()
 452         if mdata:
 453             strs = []
 454             for name, value in mdata:
 455                 # Strip whitespace off each line in a multi-line entry
 456                 strs.append('%s: %s' % (name, value))
 457             e.msgstr = '\n'.join(strs) + '\n'
 458         if self.metadata_is_fuzzy:
 459             e.flags.append('fuzzy')
 460         return e
 461
 462     def save(self, fpath=None, repr_method='__unicode__', newline=None):
 463         """
 464         Saves the po file to ``fpath``.
 465         If it is an existing file and no ``fpath`` is provided, then the
 466         existing file is rewritten with the modified data.
 467
 468         Keyword arguments:
 469
 470         ``fpath``
 471             string, full or relative path to the file.
 472
 473         ``repr_method``
 474             string, the method to use for output.
 475
 476         ``newline``
 477             string, controls how universal newlines works
 478         """
 479         if self.fpath is None and fpath is None:
 480             raise IOError('You must provide a file path to save() method')
 481         contents = getattr(self, repr_method)()
 482         if fpath is None:
 483             fpath = self.fpath
 484         if repr_method == 'to_binary':
 485             fhandle = open(fpath, 'wb')
 486         else:
 487             fhandle = io.open(fpath, 'w', encoding=self.encoding, newline=newline)
 488             if not isinstance(contents, text_type):
 489                 contents = contents.decode(self.encoding)
 490         fhandle.write(contents)
 491         fhandle.close()
 492         # set the file path if not set
 493         if self.fpath is None and fpath:
 494             self.fpath = fpath
 495
 496     def find(self, st, by='msgid', include_obsolete_entries=False, msgctxt=False):
 497         """
 498         Find the entry which msgid (or property identified by the ``by``
 499         argument) matches the string ``st``.
 500
 501         Keyword arguments:
 502
 503         ``st``
 504             string, the string to search for.
 505
 506         ``by``
 507             string, the property to use for comparison (default: ``msgid``).
 508
 509         ``include_obsolete_entries``
 510             boolean, whether to also search in entries that are obsolete.
 511
 512         ``msgctxt``
 513             string, allows specifying a specific message context for the
 514             search.
 515         """
 516         if include_obsolete_entries:
 517             entries = self[:]
 518         else:
 519             entries = [e for e in self if not e.obsolete]
 520         matches = []
 521         for e in entries:
 522             if getattr(e, by) == st:
 523                 if msgctxt is not False and e.msgctxt != msgctxt:
 524                     continue
 525                 matches.append(e)
 526         if len(matches) == 1:
 527             return matches[0]
 528         elif len(matches) > 1:
 529             if not msgctxt:
 530                 # find the entry with no msgctx
 531                 e = None
 532                 for m in matches:
 533                     if not m.msgctxt:
 534                         e = m
 535                 if e:
 536                     return e
 537                 # fallback to the first entry found
 538                 return matches[0]
 539         return None
 540
 541     def ordered_metadata(self):
 542         """
 543         Convenience method that returns an ordered version of the metadata
 544         dictionary. The return value is list of tuples (metadata name,
 545         metadata_value).
 546         """
 547         # copy the dict first
 548         metadata = self.metadata.copy()
 549         data_order = [
 550             'Project-Id-Version',
 551             'Report-Msgid-Bugs-To',
 552             'POT-Creation-Date',
 553             'PO-Revision-Date',
 554             'Last-Translator',
 555             'Language-Team',
 556             'Language',
 557             'MIME-Version',
 558             'Content-Type',
 559             'Content-Transfer-Encoding',
 560             'Plural-Forms',
 561         ]
 562         ordered_data = []
 563         for data in data_order:
 564             try:
 565                 value = metadata.pop(data)
 566                 ordered_data.append((data, value))
 567             except KeyError:
 568                 pass
 569         # the rest of the metadata will be alphabetically ordered since there
 570         # are no specs for this AFAIK
 571         for data in natural_sort(metadata.keys()):
 572             value = metadata[data]
 573             ordered_data.append((data, value))
 574         return ordered_data
 575
 576     def to_binary(self):
 577         """
 578         Return the binary representation of the file.
 579         """
 580         offsets = []
 581         entries = self.translated_entries()  # pylint: disable=no-member
 582
 583         # the keys are sorted in the .mo file
 584         def cmp(_self, other):  # pylint: disable=unused-variable
 585             # msgfmt compares entries with msgctxt if it exists
 586             self_msgid = _self.msgctxt or _self.msgid
 587             other_msgid = other.msgctxt or other.msgid
 588             if self_msgid > other_msgid:
 589                 return 1
 590             elif self_msgid < other_msgid:
 591                 return -1
 592             else:
 593                 return 0
 594
 595         # add metadata entry
 596         entries.sort(key=lambda o: o.msgid_with_context.encode('utf-8'))
 597         mentry = self.metadata_as_entry()
 598         entries = [mentry] + entries
 599         entries_len = len(entries)
 600         ids, strs = b(''), b('')
 601         for e in entries:
 602             # For each string, we need size and file offset.  Each string is
 603             # NUL terminated; the NUL does not count into the size.
 604             msgid = b('')
 605             if e.msgctxt:
 606                 # Contexts are stored by storing the concatenation of the
 607                 # context, a <EOT> byte, and the original string
 608                 msgid = self._encode(e.msgctxt + '\4')
 609             if e.msgid_plural:
 610                 msgstr = []
 611                 for index in sorted(e.msgstr_plural.keys()):
 612                     msgstr.append(e.msgstr_plural[index])
 613                 msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
 614                 msgstr = self._encode('\0'.join(msgstr))
 615             else:
 616                 msgid += self._encode(e.msgid)
 617                 msgstr = self._encode(e.msgstr)
 618             offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
 619             ids += msgid + b('\0')
 620             strs += msgstr + b('\0')
 621
 622         # The header is 7 32-bit unsigned integers.
 623         keystart = 7 * 4 + 16 * entries_len
 624         # and the values start after the keys
 625         valuestart = keystart + len(ids)
 626         koffsets = []
 627         voffsets = []
 628         # The string table first has the list of keys, then the list of values.
 629         # Each entry has first the size of the string, then the file offset.
 630         for o1, l1, o2, l2 in offsets:
 631             koffsets += [l1, o1 + keystart]
 632             voffsets += [l2, o2 + valuestart]
 633         offsets = koffsets + voffsets
 634
 635         output = struct.pack(
 636             "Iiiiiii",
 637             # Magic number
 638             MOFile.MAGIC,
 639             # Version
 640             0,
 641             # number of entries
 642             entries_len,
 643             # start of key index
 644             7 * 4,
 645             # start of value index
 646             7 * 4 + entries_len * 8,
 647             # size and offset of hash table, we don't use hash tables
 648             0,
 649             keystart,
 650         )
 651         if PY3 and sys.version_info.minor > 1:  # python 3.2 or superior
 652             output += array.array("i", offsets).tobytes()
 653         else:
 654             output += array.array("i", offsets).tostring()  # pylint: disable=no-member
 655         output += ids
 656         output += strs
 657         return output
 658
 659     def _encode(self, mixed):
 660         """
 661         Encodes the given ``mixed`` argument with the file encoding if and
 662         only if it's an unicode string and returns the encoded string.
 663         """
 664         if isinstance(mixed, text_type):
 665             mixed = mixed.encode(self.encoding)
 666         return mixed
 667
 668
 669 # }}}
 670 # class POFile {{{
 671
 672
 673 class POFile(_BaseFile):
 674     """
 675     Po (or Pot) file reader/writer.
 676     This class inherits the :class:`~polib._BaseFile` class and, by extension,
 677     the python ``list`` type.
 678     """
 679
 680     def __unicode__(self):
 681         """
 682         Returns the unicode representation of the po file.
 683         """
 684         ret, headers = '', self.header.split('\n')
 685         for header in headers:
 686             if not header:
 687                 ret += "#\n"
 688             elif header[:1] in [',', ':']:
 689                 ret += '#%s\n' % header
 690             else:
 691                 ret += '# %s\n' % header
 692
 693         if not isinstance(ret, text_type):
 694             ret = ret.decode(self.encoding)
 695
 696         return ret + _BaseFile.__unicode__(self)
 697
 698     def save_as_mofile(self, fpath):
 699         """
 700         Saves the binary representation of the file to given ``fpath``.
 701
 702         Keyword argument:
 703
 704         ``fpath``
 705             string, full or relative path to the mo file.
 706         """
 707         _BaseFile.save(self, fpath, 'to_binary')
 708
 709     def percent_translated(self):
 710         """
 711         Convenience method that returns the percentage of translated
 712         messages.
 713         """
 714         total = len([e for e in self if not e.obsolete])
 715         if total == 0:
 716             return 100
 717         translated = len(self.translated_entries())
 718         return int(translated * 100 / float(total))
 719
 720     def translated_entries(self):
 721         """
 722         Convenience method that returns the list of translated entries.
 723         """
 724         return [e for e in self if e.translated()]
 725
 726     def untranslated_entries(self):
 727         """
 728         Convenience method that returns the list of untranslated entries.
 729         """
 730         return [
 731             e for e in self if not e.translated() and not e.obsolete and not e.fuzzy
 732         ]
 733
 734     def fuzzy_entries(self):
 735         """
 736         Convenience method that returns the list of fuzzy entries.
 737         """
 738         return [e for e in self if e.fuzzy and not e.obsolete]
 739
 740     def obsolete_entries(self):
 741         """
 742         Convenience method that returns the list of obsolete entries.
 743         """
 744         return [e for e in self if e.obsolete]
 745
 746     def merge(self, refpot):
 747         """
 748         Convenience method that merges the current pofile with the pot file
 749         provided. It behaves exactly as the gettext msgmerge utility:
 750
 751         * comments of this file will be preserved, but extracted comments and
 752           occurrences will be discarded;
 753         * any translations or comments in the file will be discarded, however,
 754           dot comments and file positions will be preserved;
 755         * the fuzzy flags are preserved.
 756
 757         Keyword argument:
 758
 759         ``refpot``
 760             object POFile, the reference catalog.
 761         """
 762         # Store entries in dict/set for faster access
 763         self_entries = dict((entry.msgid_with_context, entry) for entry in self)
 764         refpot_msgids = set(entry.msgid_with_context for entry in refpot)
 765         # Merge entries that are in the refpot
 766         for entry in refpot:
 767             e = self_entries.get(entry.msgid_with_context)
 768             if e is None:
 769                 e = POEntry()
 770                 self.append(e)
 771             e.merge(entry)
 772         # ok, now we must "obsolete" entries that are not in the refpot anymore
 773         for entry in self:
 774             if entry.msgid_with_context not in refpot_msgids:
 775                 entry.obsolete = True
 776
 777
 778 # }}}
 779 # class MOFile {{{
 780
 781
 782 class MOFile(_BaseFile):
 783     """
 784     Mo file reader/writer.
 785     This class inherits the :class:`~polib._BaseFile` class and, by
 786     extension, the python ``list`` type.
 787     """
 788
 789     MAGIC = 0x950412DE
 790     MAGIC_SWAPPED = 0xDE120495
 791
 792     def __init__(self, *args, **kwargs):
 793         """
 794         Constructor, accepts all keywords arguments accepted by
 795         :class:`~polib._BaseFile` class.
 796         """
 797         _BaseFile.__init__(self, *args, **kwargs)
 798         self.magic_number = None
 799         self.version = 0
 800
 801     def save_as_pofile(self, fpath):
 802         """
 803         Saves the mofile as a pofile to ``fpath``.
 804
 805         Keyword argument:
 806
 807         ``fpath``
 808             string, full or relative path to the file.
 809         """
 810         _BaseFile.save(self, fpath)
 811
 812     # pylint: disable=no-self-use,arguments-differ
 813     def save(self, fpath=None):
 814         """
 815         Saves the mofile to ``fpath``.
 816
 817         Keyword argument:
 818
 819         ``fpath``
 820             string, full or relative path to the file.
 821         """
 822         _BaseFile.save(self, fpath, 'to_binary')
 823
 824     # pylint: disable=no-self-use
 825     def percent_translated(self):
 826         """
 827         Convenience method to keep the same interface with POFile instances.
 828         """
 829         return 100
 830
 831     # pylint: disable=no-self-use
 832     def translated_entries(self):
 833         """
 834         Convenience method to keep the same interface with POFile instances.
 835         """
 836         return self
 837
 838     # pylint: disable=no-self-use
 839     def untranslated_entries(self):
 840         """
 841         Convenience method to keep the same interface with POFile instances.
 842         """
 843         return []
 844
 845     # pylint: disable=no-self-use
 846     def fuzzy_entries(self):
 847         """
 848         Convenience method to keep the same interface with POFile instances.
 849         """
 850         return []
 851
 852     # pylint: disable=no-self-use
 853     def obsolete_entries(self):
 854         """
 855         Convenience method to keep the same interface with POFile instances.
 856         """
 857         return []
 858
 859
 860 # }}}
 861 # class _BaseEntry {{{
 862
 863
 864 class _BaseEntry(object):
 865     """
 866     Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
 867     This class should **not** be instantiated directly.
 868     """
 869
 870     def __init__(self, *_args, **kwargs):
 871         """
 872         Constructor, accepts the following keyword arguments:
 873
 874         ``msgid``
 875             string, the entry msgid.
 876
 877         ``msgstr``
 878             string, the entry msgstr.
 879
 880         ``msgid_plural``
 881             string, the entry msgid_plural.
 882
 883         ``msgstr_plural``
 884             dict, the entry msgstr_plural lines.
 885
 886         ``msgctxt``
 887             string, the entry context (msgctxt).
 888
 889         ``obsolete``
 890             bool, whether the entry is "obsolete" or not.
 891
 892         ``encoding``
 893             string, the encoding to use, defaults to ``default_encoding``
 894             global variable (optional).
 895         """
 896         self.msgid = kwargs.get('msgid', '')
 897         self.msgstr = kwargs.get('msgstr', '')
 898         self.msgid_plural = kwargs.get('msgid_plural', '')
 899         self.msgstr_plural = kwargs.get('msgstr_plural', {})
 900         self.msgctxt = kwargs.get('msgctxt', None)
 901         self.obsolete = kwargs.get('obsolete', False)
 902         self.encoding = kwargs.get('encoding', default_encoding)
 903
 904     def __unicode__(self, wrapwidth=78):
 905         """
 906         Returns the unicode representation of the entry.
 907         """
 908         if self.obsolete:
 909             delflag = '#~ '
 910         else:
 911             delflag = ''
 912         ret = []
 913         # write the msgctxt if any
 914         if self.msgctxt is not None:
 915             ret += self._str_field("msgctxt", delflag, "", self.msgctxt, wrapwidth)
 916         # write the msgid
 917         ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
 918         # write the msgid_plural if any
 919         if self.msgid_plural:
 920             ret += self._str_field(
 921                 "msgid_plural", delflag, "", self.msgid_plural, wrapwidth
 922             )
 923         if self.msgstr_plural:
 924             # write the msgstr_plural if any
 925             msgstrs = self.msgstr_plural
 926             keys = list(msgstrs)
 927             keys.sort()
 928             for index in keys:
 929                 msgstr = msgstrs[index]
 930                 plural_index = '[%s]' % index
 931                 ret += self._str_field(
 932                     "msgstr", delflag, plural_index, msgstr, wrapwidth
 933                 )
 934         else:
 935             # otherwise write the msgstr
 936             ret += self._str_field("msgstr", delflag, "", self.msgstr, wrapwidth)
 937         ret.append('')
 938         ret = u('\n').join(ret)
 939         return ret
 940
 941     if PY3:
 942
 943         def __str__(self):
 944             return self.__unicode__()
 945
 946     else:
 947
 948         def __str__(self):
 949             """
 950             Returns the string representation of the entry.
 951             """
 952             return compat.ustr(self).encode(self.encoding)
 953
 954     def __eq__(self, other):
 955         return str(self) == str(other)
 956
 957     def __hash__(self):
 958         return hash(str(self))
 959
 960     # pylint: disable=no-self-use
 961     def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78):
 962         lines = field.splitlines(True)
 963         if len(lines) > 1:
 964             lines = [''] + lines  # start with initial empty line
 965         else:
 966             escaped_field = escape(field)
 967             specialchars_count = 0
 968             for c in ['\\', '\n', '\r', '\t', '"']:
 969                 specialchars_count += field.count(c)
 970             # comparison must take into account fieldname length + one space
 971             # + 2 quotes (eg. msgid "<string>")
 972             flength = len(fieldname) + 3
 973             if plural_index:
 974                 flength += len(plural_index)
 975             real_wrapwidth = wrapwidth - flength + specialchars_count
 976             if wrapwidth > 0 and len(field) > real_wrapwidth:
 977                 # Wrap the line but take field name into account
 978                 lines = [''] + [
 979                     unescape(item)
 980                     for item in textwrap.wrap(
 981                         escaped_field,
 982                         wrapwidth - 2,  # 2 for quotes ""
 983                         drop_whitespace=False,
 984                         break_long_words=False,
 985                     )
 986                 ]
 987             else:
 988                 lines = [field]
 989         if fieldname.startswith('previous_'):
 990             # quick and dirty trick to get the real field name
 991             fieldname = fieldname[9:]
 992
 993         ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index, escape(lines.pop(0)))]
 994         for line in lines:
 995             ret.append('%s"%s"' % (delflag, escape(line)))
 996         return ret
 997
 998     @property
 999     def msgid_with_context(self):
1000         if self.msgctxt:
1001             return '%s%s%s' % (self.msgctxt, "\x04", self.msgid)
1002         return self.msgid
1003
1004
1005 # }}}
1006 # class POEntry {{{
1007
1008
class POEntry(_BaseEntry):
    """
    Represents a po file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``comment``
            string, the entry comment.

        ``tcomment``
            string, the entry translator comment.

        ``occurrences``
            list, the entry occurrences.

        ``flags``
            list, the entry flags.

        ``previous_msgctxt``
            string, the entry previous context.

        ``previous_msgid``
            string, the entry previous msgid.

        ``previous_msgid_plural``
            string, the entry previous msgid_plural.

        ``linenum``
            integer, the line number of the entry
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        self.comment = kwargs.get('comment', '')
        self.tcomment = kwargs.get('tcomment', '')
        self.occurrences = kwargs.get('occurrences', [])
        self.flags = kwargs.get('flags', [])
        self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
        self.previous_msgid = kwargs.get('previous_msgid', None)
        self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
        self.linenum = kwargs.get('linenum', None)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry.

        Comment lines come first, then the occurrences (``#:``), the flags
        (``#,``), the previous msgctxt/msgid/msgid_plural fields and finally
        the message fields rendered by ``_BaseEntry.__unicode__``.
        Obsolete entries only keep their translator comments; generated
        comments and occurrences are not written for them.
        """
        ret = []
        # comments first, if any (with text wrapping as xgettext does)
        if self.obsolete:
            comments = [('tcomment', '# ')]
        else:
            comments = [('comment', '#. '), ('tcomment', '# ')]
        for attr, marker in comments:
            val = getattr(self, attr)
            if not val:
                continue
            for comment in val.split('\n'):
                if len(comment) + len(marker) > wrapwidth > 0:
                    ret += textwrap.wrap(
                        comment,
                        wrapwidth,
                        initial_indent=marker,
                        subsequent_indent=marker,
                        break_long_words=False,
                    )
                else:
                    ret.append('%s%s' % (marker, comment))

        # occurrences (with text wrapping as xgettext does)
        if not self.obsolete and self.occurrences:
            filelist = []
            for fpath, lineno in self.occurrences:
                if lineno:
                    filelist.append('%s:%s' % (fpath, lineno))
                else:
                    filelist.append(fpath)
            filestr = ' '.join(filelist)
            if len(filestr) + 3 > wrapwidth > 0:
                # By default textwrap may break a word after a hyphen, which
                # is not what we want for filenames, so only allow breaks on
                # whitespace.
                ret += textwrap.wrap(
                    filestr,
                    wrapwidth,
                    initial_indent='#: ',
                    subsequent_indent='#: ',
                    break_long_words=False,
                    break_on_hyphens=False,
                )
            else:
                ret.append('#: ' + filestr)

        # flags (TODO: wrapping ?)
        if self.flags:
            ret.append('#, %s' % ', '.join(self.flags))

        # previous context and previous msgid/msgid_plural
        fields = ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural']
        if self.obsolete:
            prefix = "#~| "
        else:
            prefix = "#| "
        for f in fields:
            val = getattr(self, f)
            if val is not None:
                ret += self._str_field(f, prefix, "", val, wrapwidth)

        ret.append(_BaseEntry.__unicode__(self, wrapwidth))
        ret = u('\n').join(ret)
        return ret

    # pylint: disable=cmp-method
    def __cmp__(self, other):
        """
        Called by comparison operations if rich comparison is not defined.

        Returns -1, 0 or 1.  Obsolete entries sort before the others; then
        the sorted occurrences, the msgctxt, the msgid_plural, the
        msgstr_plural values, the msgid and the msgstr are compared in turn
        and the first difference decides.
        """

        def plural_values(entry):
            plurals = entry.msgstr_plural
            if plurals and isinstance(plurals, dict):
                return list(plurals.values())
            return []

        # First: Obsolete test
        if self.obsolete != other.obsolete:
            return -1 if self.obsolete else 1
        pairs = (
            # sorted() works on a copy, the original lists are left untouched
            (sorted(self.occurrences), sorted(other.occurrences)),
            # a missing context or plural msgid compares as '0'
            (self.msgctxt or '0', other.msgctxt or '0'),
            (self.msgid_plural or '0', other.msgid_plural or '0'),
            (plural_values(self), plural_values(other)),
            (self.msgid, other.msgid),
            (self.msgstr, other.msgstr),
        )
        for mine, theirs in pairs:
            if mine > theirs:
                return 1
            if mine < theirs:
                return -1
        return 0

    def __gt__(self, other):
        return self.__cmp__(other) > 0

    def __lt__(self, other):
        return self.__cmp__(other) < 0

    def __ge__(self, other):
        return self.__cmp__(other) >= 0

    def __le__(self, other):
        return self.__cmp__(other) <= 0

    def __eq__(self, other):
        # Comparing with something that is not an entry (``None``, a
        # string...) must not blow up in __cmp__: let Python handle it.
        if not isinstance(other, _BaseEntry):
            return NotImplemented
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        if not isinstance(other, _BaseEntry):
            return NotImplemented
        return self.__cmp__(other) != 0

    def translated(self):
        """
        Returns ``True`` if the entry has been translated or ``False``
        otherwise.

        Obsolete and fuzzy entries are never considered translated.  An
        entry with plural forms is translated only when none of them is
        empty.
        """
        if self.obsolete or self.fuzzy:
            return False
        if self.msgstr != '':
            return True
        if self.msgstr_plural:
            return all(
                self.msgstr_plural[pos] != '' for pos in self.msgstr_plural
            )
        return False

    def merge(self, other):
        """
        Merge the current entry with the given pot entry.

        Everything but the translations is taken from ``other``.  The
        ``fuzzy`` flag of the current entry is kept, and an empty
        translation is added for every plural index of ``other`` that the
        current entry does not have yet.
        """
        self.msgid = other.msgid
        self.msgctxt = other.msgctxt
        self.occurrences = other.occurrences
        self.comment = other.comment
        fuzzy = self.fuzzy
        self.flags = other.flags[:]  # clone flags
        if fuzzy and 'fuzzy' not in self.flags:
            # keep the fuzzy state, without duplicating the flag when the
            # pot entry already carries it
            self.flags.append('fuzzy')
        self.msgid_plural = other.msgid_plural
        self.obsolete = other.obsolete
        self.previous_msgctxt = other.previous_msgctxt
        self.previous_msgid = other.previous_msgid
        self.previous_msgid_plural = other.previous_msgid_plural
        if other.msgstr_plural:
            for pos in other.msgstr_plural:
                # keep existing translation at pos if any
                self.msgstr_plural.setdefault(pos, '')

    @property
    def fuzzy(self):
        """``True`` when ``'fuzzy'`` is one of the entry flags."""
        return 'fuzzy' in self.flags

    def __hash__(self):
        return hash((self.msgid, self.msgstr))
1245
1246
1247 # }}}
1248 # class MOEntry {{{
1249
1250
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor.

        For consistency with :class:`~polib.POEntry` the following keyword
        arguments are accepted:

        ``comment``
        ``tcomment``
        ``occurrences``
        ``flags``
        ``previous_msgctxt``
        ``previous_msgid``
        ``previous_msgid_plural``

        Note: they hold no real meaning in the context of MO files, so the
        values passed are ignored and the matching attributes always start
        out empty.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # po-only attributes: always reset, whatever was passed in
        self.comment = self.tcomment = ''
        self.occurrences, self.flags = [], []
        self.previous_msgctxt = None
        self.previous_msgid = None
        self.previous_msgid_plural = None

    def __hash__(self):
        key = (self.msgid, self.msgstr)
        return hash(key)
1284
1285
1286 # }}}
1287 # class _POFileParser {{{
1288
1289
1290 class _POFileParser(object):
1291     """
1292     A finite state machine to parse efficiently and correctly po
1293     file format.
1294     """
1295
1296     # pylint: disable=redefined-outer-name
1297     def __init__(self, pofile, *_args, **kwargs):
1298         """
1299         Constructor.
1300
1301         Keyword arguments:
1302
1303         ``pofile``
1304             string, path to the po file or its content
1305
1306         ``encoding``
1307             string, the encoding to use, defaults to ``default_encoding``
1308             global variable (optional).
1309
1310         ``check_for_duplicates``
1311             whether to check for duplicate entries when adding entries to the
1312             file (optional, default: ``False``).
1313         """
1314         enc = kwargs.get('encoding', default_encoding)
1315         if _is_file(pofile):
1316             try:
1317                 self.fhandle = io.open(pofile, 'rt', encoding=enc)
1318             except LookupError:
1319                 enc = default_encoding
1320                 self.fhandle = io.open(pofile, 'rt', encoding=enc)
1321         else:
1322             self.fhandle = pofile.splitlines()
1323
1324         klass = kwargs.get('klass')
1325         if klass is None:
1326             klass = POFile
1327         self.instance = klass(
1328             pofile=pofile,
1329             encoding=enc,
1330             check_for_duplicates=kwargs.get('check_for_duplicates', False),
1331         )
1332         self.transitions = {}
1333         self.current_line = 0
1334         self.current_entry = POEntry(linenum=self.current_line)
1335         self.current_state = 'st'
1336         self.current_token = None
1337         # two memo flags used in handlers
1338         self.msgstr_index = 0
1339         self.entry_obsolete = 0
1340         # Configure the state machine, by adding transitions.
1341         # Signification of symbols:
1342         #     * ST: Beginning of the file (start)
1343         #     * HE: Header
1344         #     * TC: a translation comment
1345         #     * GC: a generated comment
1346         #     * OC: a file/line occurrence
1347         #     * FL: a flags line
1348         #     * CT: a message context
1349         #     * PC: a previous msgctxt
1350         #     * PM: a previous msgid
1351         #     * PP: a previous msgid_plural
1352         #     * MI: a msgid
1353         #     * MP: a msgid plural
1354         #     * MS: a msgstr
1355         #     * MX: a msgstr plural
1356         #     * MC: a msgid or msgstr continuation line
1357         # pylint: disable=redefined-builtin
1358         all = [
1359             'st',
1360             'he',
1361             'gc',
1362             'oc',
1363             'fl',
1364             'ct',
1365             'pc',
1366             'pm',
1367             'pp',
1368             'tc',
1369             'ms',
1370             'mp',
1371             'mx',
1372             'mi',
1373         ]
1374
1375         self.add('tc', ['st', 'he'], 'he')
1376         self.add(
1377             'tc',
1378             ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mp', 'mx', 'mi'],
1379             'tc',
1380         )
1381         self.add('gc', all, 'gc')
1382         self.add('oc', all, 'oc')
1383         self.add('fl', all, 'fl')
1384         self.add('pc', all, 'pc')
1385         self.add('pm', all, 'pm')
1386         self.add('pp', all, 'pp')
1387         self.add(
1388             'ct',
1389             ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
1390             'ct',
1391         )
1392         self.add(
1393             'mi',
1394             ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
1395             'mi',
1396         )
1397         self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
1398         self.add('ms', ['mi', 'mp', 'tc'], 'ms')
1399         self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
1400         self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')
1401
1402     # pylint: disable=too-many-branches
1403     def parse(self):
1404         """
1405         Run the state machine, parse the file line by line and call process()
1406         with the current matched symbol.
1407         """
1408
1409         keywords = {
1410             'msgctxt': 'ct',
1411             'msgid': 'mi',
1412             'msgstr': 'ms',
1413             'msgid_plural': 'mp',
1414         }
1415         prev_keywords = {
1416             'msgid_plural': 'pp',
1417             'msgid': 'pm',
1418             'msgctxt': 'pc',
1419         }
1420         tokens = []
1421         fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
1422         for line in self.fhandle:
1423             self.current_line += 1
1424             if self.current_line == 1:
1425                 BOM = codecs.BOM_UTF8.decode('utf-8')
1426                 if line.startswith(BOM):
1427                     line = line[len(BOM) :]
1428             line = line.strip()
1429             if line == '':
1430                 continue
1431
1432             tokens = line.split(None, 2)
1433             nb_tokens = len(tokens)
1434
1435             if tokens[0] == '#~|':
1436                 continue
1437
1438             if tokens[0] == '#~' and nb_tokens > 1:
1439                 line = line[3:].strip()
1440                 tokens = tokens[1:]
1441                 nb_tokens -= 1
1442                 self.entry_obsolete = 1
1443             else:
1444                 self.entry_obsolete = 0
1445
1446             # Take care of keywords like
1447             # msgid, msgid_plural, msgctxt & msgstr.
1448             if tokens[0] in keywords and nb_tokens > 1:
1449                 line = line[len(tokens[0]) :].lstrip()
1450                 if re.search(r'([^\\]|^)"', line[1:-1]):
1451                     raise IOError(
1452                         'Syntax error in po file %s(line %s): '
1453                         'unescaped double quote found' % (fpath, self.current_line)
1454                     )
1455                 self.current_token = line
1456                 self.process(keywords[tokens[0]])
1457                 continue
1458
1459             self.current_token = line
1460
1461             if tokens[0] == '#:':
1462                 if nb_tokens <= 1:
1463                     continue
1464                 # we are on a occurrences line
1465                 self.process('oc')
1466
1467             elif line[:1] == '"':
1468                 # we are on a continuation line
1469                 if re.search(r'([^\\]|^)"', line[1:-1]):
1470                     raise IOError(
1471                         'Syntax error in po file %s(line %s): '
1472                         'unescaped double quote found' % (fpath, self.current_line)
1473                     )
1474                 self.process('mc')
1475
1476             elif line[:7] == 'msgstr[':
1477                 # we are on a msgstr plural
1478                 self.process('mx')
1479
1480             elif tokens[0] == '#,':
1481                 if nb_tokens <= 1:
1482                     continue
1483                 # we are on a flags line
1484                 self.process('fl')
1485
1486             elif tokens[0] == '#' or tokens[0].startswith('##'):
1487                 if line == '#':
1488                     line += ' '
1489                 # we are on a translator comment line
1490                 self.process('tc')
1491
1492             elif tokens[0] == '#.':
1493                 if nb_tokens <= 1:
1494                     continue
1495                 # we are on a generated comment line
1496                 self.process('gc')
1497
1498             elif tokens[0] == '#|':
1499                 if nb_tokens <= 1:
1500                     raise IOError(
1501                         'Syntax error in po file %s(line %s)'
1502                         % (fpath, self.current_line)
1503                     )
1504
1505                 # Remove the marker and any whitespace right after that.
1506                 line = line[2:].lstrip()
1507                 self.current_token = line
1508
1509                 if tokens[1].startswith('"'):
1510                     # Continuation of previous metadata.
1511                     self.process('mc')
1512                     continue
1513
1514                 if nb_tokens == 2:
1515                     # Invalid continuation line.
1516                     raise IOError(
1517                         'Syntax error in po file %s(line %s): '
1518                         'invalid continuation line' % (fpath, self.current_line)
1519                     )
1520
1521                 # we are on a "previous translation" comment line,
1522                 if tokens[1] not in prev_keywords:
1523                     # Unknown keyword in previous translation comment.
1524                     raise IOError(
1525                         'Syntax error in po file %s(line %s): '
1526                         'unknown keyword %s' % (fpath, self.current_line, tokens[1])
1527                     )
1528
1529                 # Remove the keyword and any whitespace
1530                 # between it and the starting quote.
1531                 line = line[len(tokens[1]) :].lstrip()
1532                 self.current_token = line
1533                 self.process(prev_keywords[tokens[1]])
1534
1535             else:
1536                 raise IOError(
1537                     'Syntax error in po file %s(line %s)' % (fpath, self.current_line)
1538                 )
1539
1540         if self.current_entry and len(tokens) > 0 and not tokens[0].startswith('#'):
1541             # since entries are added when another entry is found, we must add
1542             # the last entry here (only if there are lines). Trailing comments
1543             # are ignored
1544             self.instance.append(self.current_entry)
1545
1546         # before returning the instance, check if there's metadata and if
1547         # so extract it in a dict
1548         metadataentry = self.instance.find('')
1549         if metadataentry:  # metadata found
1550             # remove the entry
1551             self.instance.remove(metadataentry)
1552             self.instance.metadata_is_fuzzy = metadataentry.flags
1553             key = None
1554             for msg in metadataentry.msgstr.splitlines():
1555                 try:
1556                     key, val = msg.split(':', 1)
1557                     self.instance.metadata[key] = val.strip()
1558                 except (ValueError, KeyError):
1559                     if key is not None:
1560                         self.instance.metadata[key] += '\n' + msg.strip()
1561         # close opened file
1562         if not isinstance(self.fhandle, list):  # must be file
1563             self.fhandle.close()
1564         return self.instance
1565
1566     def add(self, symbol, states, next_state):
1567         """
1568         Add a transition to the state machine.
1569
1570         Keywords arguments:
1571
1572         ``symbol``
1573             string, the matched token (two chars symbol).
1574
1575         ``states``
1576             list, a list of states (two chars symbols).
1577
1578         ``next_state``
1579             the next state the fsm will have after the action.
1580         """
1581         for state in states:
1582             action = getattr(self, 'handle_%s' % next_state)
1583             self.transitions[(symbol, state)] = (action, next_state)
1584
1585     def process(self, symbol):
1586         """
1587         Process the transition corresponding to the current state and the
1588         symbol provided.
1589
1590         Keywords arguments:
1591
1592         ``symbol``
1593             string, the matched token (two chars symbol).
1594
1595         ``linenum``
1596             integer, the current line number of the parsed file.
1597         """
1598         try:
1599             (action, state) = self.transitions[(symbol, self.current_state)]
1600             if action():
1601                 self.current_state = state
1602         except Exception:
1603             fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
1604             if hasattr(self.fhandle, 'close'):
1605                 self.fhandle.close()
1606             raise IOError(
1607                 'Syntax error in po file %s(line %s)' % (fpath, self.current_line)
1608             )
1609
1610     # state handlers
1611
1612     def handle_he(self):
1613         """Handle a header comment."""
1614         if self.instance.header != '':
1615             self.instance.header += '\n'
1616         self.instance.header += self.current_token[2:]
1617         return 1
1618
1619     def handle_tc(self):
1620         """Handle a translator comment."""
1621         if self.current_state in ['mc', 'ms', 'mx']:
1622             self.instance.append(self.current_entry)
1623             self.current_entry = POEntry(linenum=self.current_line)
1624         if self.current_entry.tcomment != '':
1625             self.current_entry.tcomment += '\n'
1626         tcomment = self.current_token.lstrip('#')
1627         if tcomment.startswith(' '):
1628             tcomment = tcomment[1:]
1629         self.current_entry.tcomment += tcomment
1630         return True
1631
1632     def handle_gc(self):
1633         """Handle a generated comment."""
1634         if self.current_state in ['mc', 'ms', 'mx']:
1635             self.instance.append(self.current_entry)
1636             self.current_entry = POEntry(linenum=self.current_line)
1637         if self.current_entry.comment != '':
1638             self.current_entry.comment += '\n'
1639         self.current_entry.comment += self.current_token[3:]
1640         return True
1641
1642     def handle_oc(self):
1643         """Handle a file:num occurrence."""
1644         if self.current_state in ['mc', 'ms', 'mx']:
1645             self.instance.append(self.current_entry)
1646             self.current_entry = POEntry(linenum=self.current_line)
1647         occurrences = self.current_token[3:].split()
1648         for occurrence in occurrences:
1649             if occurrence != '':
1650                 try:
1651                     fil, line = occurrence.rsplit(':', 1)
1652                     if not line.isdigit():
1653                         fil = occurrence
1654                         line = ''
1655                     self.current_entry.occurrences.append((fil, line))
1656                 except (ValueError, AttributeError):
1657                     self.current_entry.occurrences.append((occurrence, ''))
1658         return True
1659
1660     def handle_fl(self):
1661         """Handle a flags line."""
1662         if self.current_state in ['mc', 'ms', 'mx']:
1663             self.instance.append(self.current_entry)
1664             self.current_entry = POEntry(linenum=self.current_line)
1665         self.current_entry.flags += [
1666             c.strip() for c in self.current_token[3:].split(',')
1667         ]
1668         return True
1669
1670     def handle_pp(self):
1671         """Handle a previous msgid_plural line."""
1672         if self.current_state in ['mc', 'ms', 'mx']:
1673             self.instance.append(self.current_entry)
1674             self.current_entry = POEntry(linenum=self.current_line)
1675         self.current_entry.previous_msgid_plural = unescape(self.current_token[1:-1])
1676         return True
1677
1678     def handle_pm(self):
1679         """Handle a previous msgid line."""
1680         if self.current_state in ['mc', 'ms', 'mx']:
1681             self.instance.append(self.current_entry)
1682             self.current_entry = POEntry(linenum=self.current_line)
1683         self.current_entry.previous_msgid = unescape(self.current_token[1:-1])
1684         return True
1685
1686     def handle_pc(self):
1687         """Handle a previous msgctxt line."""
1688         if self.current_state in ['mc', 'ms', 'mx']:
1689             self.instance.append(self.current_entry)
1690             self.current_entry = POEntry(linenum=self.current_line)
1691         self.current_entry.previous_msgctxt = unescape(self.current_token[1:-1])
1692         return True
1693
1694     def handle_ct(self):
1695         """Handle a msgctxt."""
1696         if self.current_state in ['mc', 'ms', 'mx']:
1697             self.instance.append(self.current_entry)
1698             self.current_entry = POEntry(linenum=self.current_line)
1699         self.current_entry.msgctxt = unescape(self.current_token[1:-1])
1700         return True
1701
1702     def handle_mi(self):
1703         """Handle a msgid."""
1704         if self.current_state in ['mc', 'ms', 'mx']:
1705             self.instance.append(self.current_entry)
1706             self.current_entry = POEntry(linenum=self.current_line)
1707         self.current_entry.obsolete = self.entry_obsolete
1708         self.current_entry.msgid = unescape(self.current_token[1:-1])
1709         return True
1710
1711     def handle_mp(self):
1712         """Handle a msgid plural."""
1713         self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
1714         return True
1715
1716     def handle_ms(self):
1717         """Handle a msgstr."""
1718         self.current_entry.msgstr = unescape(self.current_token[1:-1])
1719         return True
1720
1721     def handle_mx(self):
1722         """Handle a msgstr plural."""
1723         index = self.current_token[7]
1724         value = self.current_token[self.current_token.find('"') + 1 : -1]
1725         self.current_entry.msgstr_plural[int(index)] = unescape(value)
1726         self.msgstr_index = int(index)
1727         return True
1728
1729     def handle_mc(self):
1730         """Handle a msgid or msgstr continuation line."""
1731         token = unescape(self.current_token[1:-1])
1732         if self.current_state == 'ct':
1733             self.current_entry.msgctxt += token
1734         elif self.current_state == 'mi':
1735             self.current_entry.msgid += token
1736         elif self.current_state == 'mp':
1737             self.current_entry.msgid_plural += token
1738         elif self.current_state == 'ms':
1739             self.current_entry.msgstr += token
1740         elif self.current_state == 'mx':
1741             self.current_entry.msgstr_plural[self.msgstr_index] += token
1742         elif self.current_state == 'pp':
1743             self.current_entry.previous_msgid_plural += token
1744         elif self.current_state == 'pm':
1745             self.current_entry.previous_msgid += token
1746         elif self.current_state == 'pc':
1747             self.current_entry.previous_msgctxt += token
1748         # don't change the current state
1749         return False
1750
1751
1752 # }}}
1753 # class _MOFileParser {{{
1754
1755
class _MOFileParser(object):
    """
    A class to parse binary mo files.
    """

    # pylint: disable=unused-argument,redefined-outer-name
    def __init__(self, mofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``mofile``
            string, path to the mo file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class instantiated to hold the parsed content, defaults to
            ``MOFile`` (optional).
        """
        if _is_file(mofile):
            self.fhandle = open(mofile, 'rb')
        else:
            self.fhandle = io.BytesIO(mofile)

        klass = kwargs.get('klass')
        if klass is None:
            klass = MOFile
        self.instance = klass(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False),
        )

    def __del__(self):
        """
        Make sure the file is closed, this prevents warnings on unclosed file
        when running tests with python >= 3.2.
        """
        # ``fhandle`` does not exist when ``__init__`` raised before assigning
        # it (e.g. ``open`` failed), yet the finalizer still runs.
        fhandle = getattr(self, 'fhandle', None)
        if fhandle is not None and hasattr(fhandle, 'close'):
            fhandle.close()

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Returns the instance created by the constructor, filled with the
        metadata and the entries read from the file.  Raises ``IOError`` when
        the magic number or the major revision number is not recognized.  The
        file handle is closed on exit, whether parsing succeeded or not.
        """
        try:
            self._parse_content()
        finally:
            # close opened file, even when its content turned out invalid
            self.fhandle.close()
        return self.instance

    def _parse_content(self):
        """Read the header, both string tables and every entry of the file."""
        # parse magic number, it also gives the byte order of the file
        magic_number = self._readbinary('<I', 4)
        if magic_number == MOFile.MAGIC:
            ii = '<II'
        elif magic_number == MOFile.MAGIC_SWAPPED:
            ii = '>II'
        else:
            raise IOError('Invalid mo file, magic number is incorrect !')
        self.instance.magic_number = magic_number
        # parse the version number and the number of strings
        version, numofstrings = self._readbinary(ii, 8)
        # from MO file format specs: "A program seeing an unexpected major
        # revision number should stop reading the MO file entirely"
        if version >> 16 not in (0, 1):
            raise IOError('Invalid mo file, unexpected major revision number')
        self.instance.version = version
        # offsets of the original strings table and of the translations table
        msgids_table_offset, msgstrs_table_offset = self._readbinary(ii, 8)
        msgids_index = self._read_table(ii, msgids_table_offset, numofstrings)
        msgstrs_index = self._read_table(ii, msgstrs_table_offset, numofstrings)
        # build entries
        for i in range(numofstrings):
            msgid_length, msgid_offset = msgids_index[i]
            self.fhandle.seek(msgid_offset)
            msgid = self.fhandle.read(msgid_length)

            msgstr_length, msgstr_offset = msgstrs_index[i]
            self.fhandle.seek(msgstr_offset)
            msgstr = self.fhandle.read(msgstr_length)

            if i == 0 and not msgid:  # metadata
                self.instance.metadata = self._parse_metadata(msgstr)
                continue
            # test if we have a plural entry: singular and plural forms are
            # separated by NUL bytes
            msgid_tokens = msgid.split(b('\0'))
            if len(msgid_tokens) > 1:
                entry = self._build_entry(
                    msgid=msgid_tokens[0],
                    msgid_plural=msgid_tokens[1],
                    msgstr_plural=dict(enumerate(msgstr.split(b('\0')))),
                )
            else:
                entry = self._build_entry(msgid=msgid, msgstr=msgstr)
            self.instance.append(entry)

    def _read_table(self, fmt, offset, count):
        """Return the ``count`` (length, offset) pairs stored at ``offset``."""
        self.fhandle.seek(offset)
        return [self._readbinary(fmt, 8) for _ in range(count)]

    def _parse_metadata(self, raw_metadata):
        """Decode the ``key: value`` lines of the header entry into a dict."""
        encoding = self.instance.encoding
        metadata = {}
        for line in raw_metadata.split(b('\n')):
            tokens = line.split(b(':'), 1)
            if tokens[0] == b(''):
                # blank line, or line without a key
                continue
            key = tokens[0].decode(encoding)
            if len(tokens) > 1:
                metadata[key] = tokens[1].decode(encoding).strip()
            else:
                # a key without any colon gets an empty value
                metadata[key] = u('')
        return metadata

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None, msgstr_plural=None):
        """
        Return a ``MOEntry`` built from raw (undecoded) strings.

        ``msgid`` may embed a context, separated from the message id by an
        EOT byte (``\\x04``).  Every value is decoded with the encoding of the
        instance; empty or missing optional values are left out.  The
        ``msgstr_plural`` dict passed in is not modified.
        """
        encoding = self.instance.encoding
        # only the first EOT separates the context from the message id
        msgctxt_msgid = msgid.split(b('\x04'), 1)
        if len(msgctxt_msgid) > 1:
            kwargs = {
                'msgctxt': msgctxt_msgid[0].decode(encoding),
                'msgid': msgctxt_msgid[1].decode(encoding),
            }
        else:
            kwargs = {'msgid': msgid.decode(encoding)}
        if msgstr:
            kwargs['msgstr'] = msgstr.decode(encoding)
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
        if msgstr_plural:
            kwargs['msgstr_plural'] = dict(
                (index, text.decode(encoding))
                for index, text in msgstr_plural.items()
            )
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpack n bytes of data using format <fmt>.
        It returns a tuple or a mixed value if the tuple length is 1.

        ``struct.error`` is raised when the bytes read do not match the size
        required by ``fmt`` (e.g. on a truncated file).
        """
        content = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, content)
        if len(tup) == 1:
            return tup[0]
        return tup
1902
1903
1904 # }}}