cola/polib.py

   1 # -* coding: utf-8 -*-
   2 #
   3 # License: MIT (see extras/polib/LICENSE file provided)
   4 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
   5 # pylint: disable=consider-using-with,no-else-return
   6
   7 """
   8 **polib** allows you to manipulate, create, modify gettext files (pot, po and
   9 mo files).  You can load existing files, iterate through its entries, add,
  10 modify entries, comments or metadata, etc. or create new po files from scratch.
  11
  12 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
  13 :func:`~polib.mofile` convenience functions.
  14 """
  15 from __future__ import absolute_import, division, print_function
  16 import array
  17 import codecs
  18 import os
  19 import re
  20 import struct
  21 import sys
  22 import textwrap
  23 import io
  24
  25 from . import compat
  26
  27
# Package metadata.
__author__ = 'David Jean Louis <izimobil@gmail.com>'
__version__ = '1.1.1'
# Names exported by ``from polib import *``.
__all__ = [
    'pofile',
    'POFile',
    'POEntry',
    'mofile',
    'MOFile',
    'MOEntry',
    'default_encoding',
    'escape',
    'unescape',
    'detect_encoding',
]


# the default encoding to use when encoding cannot be detected
# (returned by detect_encoding() as its fallback value)
default_encoding = 'utf-8'
  46
  47 # python 2/3 compatibility helpers {{{
  48
  49
  50 if sys.version_info < (3,):
  51     PY3 = False
  52     text_type = compat.ustr
  53
  54     def b(s):
  55         return s
  56
  57     def u(s):
  58         return compat.ustr(s, "unicode_escape")
  59
  60 else:
  61     PY3 = True
  62     text_type = str
  63
  64     def b(s):
  65         return s.encode("utf-8")
  66
  67     def u(s):
  68         return s
  69
  70
  71 # }}}
  72 # _pofile_or_mofile {{{
  73
  74
  75 def _pofile_or_mofile(f, filetype, **kwargs):
  76     """
  77     Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
  78     honor the DRY concept.
  79     """
  80     # get the file encoding
  81     enc = kwargs.get('encoding')
  82     if enc is None:
  83         enc = detect_encoding(f, filetype == 'mofile')
  84
  85     # parse the file
  86     kls = _POFileParser if filetype == 'pofile' else _MOFileParser
  87     parser = kls(
  88         f,
  89         encoding=enc,
  90         check_for_duplicates=kwargs.get('check_for_duplicates', False),
  91         klass=kwargs.get('klass'),
  92     )
  93     instance = parser.parse()
  94     instance.wrapwidth = kwargs.get('wrapwidth', 78)
  95     return instance
  96
  97
  98 # }}}
  99 # _is_file {{{
 100
 101
 102 def _is_file(filename_or_contents):
 103     """
 104     Safely returns the value of os.path.exists(filename_or_contents).
 105
 106     Arguments:
 107
 108     ``filename_or_contents``
 109         either a filename, or a string holding the contents of some file.
 110         In the latter case, this function will always return False.
 111     """
 112     try:
 113         return os.path.isfile(filename_or_contents)
 114     except (TypeError, ValueError, UnicodeEncodeError):
 115         return False
 116
 117
 118 # }}}
 119 # function pofile() {{{
 120
 121
 122 # pylint: disable=redefined-outer-name
 123 def pofile(pofile, **kwargs):
 124     """
 125     Convenience function that parses the po or pot file ``pofile`` and returns
 126     a :class:`~polib.POFile` instance.
 127
 128     Arguments:
 129
 130     ``pofile``
 131         string, full or relative path to the po/pot file or its content (data).
 132
 133     ``wrapwidth``
 134         integer, the wrap width, only useful when the ``-w`` option was passed
 135         to xgettext (optional, default: ``78``).
 136
 137     ``encoding``
 138         string, the encoding to use (e.g. "utf-8") (default: ``None``, the
 139         encoding will be auto-detected).
 140
 141     ``check_for_duplicates``
 142         whether to check for duplicate entries when adding entries to the
 143         file (optional, default: ``False``).
 144
 145     ``klass``
 146         class which is used to instantiate the return value (optional,
 147         default: ``None``, the return value with be a :class:`~polib.POFile`
 148         instance).
 149     """
 150     return _pofile_or_mofile(pofile, 'pofile', **kwargs)
 151
 152
 153 # }}}
 154 # function mofile() {{{
 155
 156
 157 # pylint: disable=redefined-outer-name
 158 def mofile(mofile, **kwargs):
 159     """
 160     Convenience function that parses the mo file ``mofile`` and returns a
 161     :class:`~polib.MOFile` instance.
 162
 163     Arguments:
 164
 165     ``mofile``
 166         string, full or relative path to the mo file or its content (string
 167         or bytes).
 168
 169     ``wrapwidth``
 170         integer, the wrap width, only useful when the ``-w`` option was passed
 171         to xgettext to generate the po file that was used to format the mo file
 172         (optional, default: ``78``).
 173
 174     ``encoding``
 175         string, the encoding to use (e.g. "utf-8") (default: ``None``, the
 176         encoding will be auto-detected).
 177
 178     ``check_for_duplicates``
 179         whether to check for duplicate entries when adding entries to the
 180         file (optional, default: ``False``).
 181
 182     ``klass``
 183         class which is used to instantiate the return value (optional,
 184         default: ``None``, the return value with be a :class:`~polib.POFile`
 185         instance).
 186     """
 187     return _pofile_or_mofile(mofile, 'mofile', **kwargs)
 188
 189
 190 # }}}
 191 # function detect_encoding() {{{
 192
 193
 194 def detect_encoding(file, binary_mode=False):
 195     """
 196     Try to detect the encoding used by the ``file``. The ``file`` argument can
 197     be a PO or MO file path or a string containing the contents of the file.
 198     If the encoding cannot be detected, the function will return the value of
 199     ``default_encoding``.
 200
 201     Arguments:
 202
 203     ``file``
 204         string, full or relative path to the po/mo file or its content.
 205
 206     ``binary_mode``
 207         boolean, set this to True if ``file`` is a mo file.
 208     """
 209     PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
 210     rxt = re.compile(u(PATTERN))
 211     rxb = re.compile(b(PATTERN))
 212
 213     def charset_exists(charset):
 214         """Check whether ``charset`` is valid or not."""
 215         try:
 216             codecs.lookup(charset)
 217         except LookupError:
 218             return False
 219         return True
 220
 221     if not _is_file(file):
 222         try:
 223             match = rxt.search(file)
 224         except TypeError:
 225             match = rxb.search(file)
 226         if match:
 227             enc = match.group(1).strip()
 228             if not isinstance(enc, text_type):
 229                 enc = enc.decode('utf-8')
 230             if charset_exists(enc):
 231                 return enc
 232     else:
 233         # For PY3, always treat as binary
 234         if binary_mode or PY3:
 235             mode = 'rb'
 236             rx = rxb
 237         else:
 238             mode = 'r'
 239             rx = rxt
 240         f = open(file, mode)
 241         for line in f.readlines():
 242             match = rx.search(line)
 243             if match:
 244                 f.close()
 245                 enc = match.group(1).strip()
 246                 if not isinstance(enc, text_type):
 247                     enc = enc.decode('utf-8')
 248                 if charset_exists(enc):
 249                     return enc
 250         f.close()
 251     return default_encoding
 252
 253
 254 # }}}
 255 # function escape() {{{
 256
 257
 258 def escape(st):
 259     """
 260     Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 261     the given string ``st`` and returns it.
 262     """
 263     return (
 264         st.replace('\\', r'\\')
 265         .replace('\t', r'\t')
 266         .replace('\r', r'\r')
 267         .replace('\n', r'\n')
 268         .replace('\"', r'\"')
 269     )
 270
 271
 272 # }}}
 273 # function unescape() {{{
 274
 275
 276 def unescape(st):
 277     """
 278     Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 279     the given string ``st`` and returns it.
 280     """
 281
 282     def unescape_repl(m):
 283         m = m.group(1)
 284         if m == 'n':
 285             return '\n'
 286         if m == 't':
 287             return '\t'
 288         if m == 'r':
 289             return '\r'
 290         if m == '\\':
 291             return '\\'
 292         return m  # handles escaped double quote
 293
 294     return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
 295
 296
 297 # }}}
 298 # function natural_sort() {{{
 299
 300
 301 def natural_sort(lst):
 302     """
 303     Sort naturally the given list.
 304     Credits: http://stackoverflow.com/a/4836734
 305     """
 306
 307     def convert(text):
 308         return int(text) if text.isdigit() else text.lower()
 309
 310     def alphanum_key(key):
 311         return [convert(c) for c in re.split('([0-9]+)', key)]
 312
 313     return sorted(lst, key=alphanum_key)
 314
 315
 316 # }}}
 317 # class _BaseFile {{{
 318
 319
 320 class _BaseFile(list):
 321     """
 322     Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
 323     classes. This class should **not** be instantiated directly.
 324     """
 325
 326     def __init__(self, *_args, **kwargs):
 327         """
 328         Constructor, accepts the following keyword arguments:
 329
 330         ``pofile``
 331             string, the path to the po or mo file, or its content as a string.
 332
 333         ``wrapwidth``
 334             integer, the wrap width, only useful when the ``-w`` option was
 335             passed to xgettext (optional, default: ``78``).
 336
 337         ``encoding``
 338             string, the encoding to use, defaults to ``default_encoding``
 339             global variable (optional).
 340
 341         ``check_for_duplicates``
 342             whether to check for duplicate entries when adding entries to the
 343             file, (optional, default: ``False``).
 344         """
 345         list.__init__(self)
 346         # the opened file handle
 347         pofile = kwargs.get('pofile', None)  # pylint: disable=redefined-outer-name
 348         if pofile and _is_file(pofile):
 349             self.fpath = pofile
 350         else:
 351             self.fpath = kwargs.get('fpath')
 352         # the width at which lines should be wrapped
 353         self.wrapwidth = kwargs.get('wrapwidth', 78)
 354         # the file encoding
 355         self.encoding = kwargs.get('encoding', default_encoding)
 356         # whether to check for duplicate entries or not
 357         self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
 358         # header
 359         self.header = ''
 360         # both po and mo files have metadata
 361         self.metadata = {}
 362         self.metadata_is_fuzzy = 0
 363
 364     def __unicode__(self):
 365         """
 366         Returns the unicode representation of the file.
 367         """
 368         ret = []
 369         entries = [self.metadata_as_entry()] + [e for e in self if not e.obsolete]
 370         for entry in entries:
 371             ret.append(entry.__unicode__(self.wrapwidth))
 372         for entry in self.obsolete_entries():  # pylint: disable=no-member
 373             ret.append(entry.__unicode__(self.wrapwidth))
 374         ret = u('\n').join(ret)
 375         return ret
 376
 377     if PY3:
 378
 379         def __str__(self):
 380             return self.__unicode__()
 381
 382     else:
 383
 384         def __str__(self):
 385             """
 386             Returns the string representation of the file.
 387             """
 388             return compat.ustr(self).encode(self.encoding)
 389
 390     def __contains__(self, entry):
 391         """
 392         Overridden ``list`` method to implement the membership test (in and
 393         not in).
 394         The method considers that an entry is in the file if it finds an entry
 395         that has the same msgid (the test is **case sensitive**) and the same
 396         msgctxt (or none for both entries).
 397
 398         Argument:
 399
 400         ``entry``
 401             an instance of :class:`~polib._BaseEntry`.
 402         """
 403         return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) is not None
 404
 405     def __eq__(self, other):
 406         return str(self) == str(other)
 407
 408     def __hash__(self):
 409         return hash(str(self))
 410
 411     def append(self, entry):
 412         """
 413         Overridden method to check for duplicates entries, if a user tries to
 414         add an entry that is already in the file, the method will raise a
 415         ``ValueError`` exception.
 416
 417         Argument:
 418
 419         ``entry``
 420             an instance of :class:`~polib._BaseEntry`.
 421         """
 422         # check_for_duplicates may not be defined (yet) when unpickling.
 423         # But if pickling, we never want to check for duplicates anyway.
 424         if getattr(self, 'check_for_duplicates', False) and entry in self:
 425             raise ValueError('Entry "%s" already exists' % entry.msgid)
 426         super(_BaseFile, self).append(entry)
 427
 428     def insert(self, index, entry):
 429         """
 430         Overridden method to check for duplicates entries, if a user tries to
 431         add an entry that is already in the file, the method will raise a
 432         ``ValueError`` exception.
 433
 434         Arguments:
 435
 436         ``index``
 437             index at which the entry should be inserted.
 438
 439         ``entry``
 440             an instance of :class:`~polib._BaseEntry`.
 441         """
 442         if self.check_for_duplicates and entry in self:
 443             raise ValueError('Entry "%s" already exists' % entry.msgid)
 444         super(_BaseFile, self).insert(index, entry)
 445
 446     def metadata_as_entry(self):
 447         """
 448         Returns the file metadata as a :class:`~polib.POFile` instance.
 449         """
 450         e = POEntry(msgid='')
 451         mdata = self.ordered_metadata()
 452         if mdata:
 453             strs = []
 454             for name, value in mdata:
 455                 # Strip whitespace off each line in a multi-line entry
 456                 strs.append('%s: %s' % (name, value))
 457             e.msgstr = '\n'.join(strs) + '\n'
 458         if self.metadata_is_fuzzy:
 459             e.flags.append('fuzzy')
 460         return e
 461
 462     def save(self, fpath=None, repr_method='__unicode__', newline=None):
 463         """
 464         Saves the po file to ``fpath``.
 465         If it is an existing file and no ``fpath`` is provided, then the
 466         existing file is rewritten with the modified data.
 467
 468         Keyword arguments:
 469
 470         ``fpath``
 471             string, full or relative path to the file.
 472
 473         ``repr_method``
 474             string, the method to use for output.
 475
 476         ``newline``
 477             string, controls how universal newlines works
 478         """
 479         if self.fpath is None and fpath is None:
 480             raise IOError('You must provide a file path to save() method')
 481         contents = getattr(self, repr_method)()
 482         if fpath is None:
 483             fpath = self.fpath
 484         if repr_method == 'to_binary':
 485             fhandle = open(fpath, 'wb')
 486         else:
 487             fhandle = io.open(fpath, 'w', encoding=self.encoding, newline=newline)
 488             if not isinstance(contents, text_type):
 489                 contents = contents.decode(self.encoding)
 490         fhandle.write(contents)
 491         fhandle.close()
 492         # set the file path if not set
 493         if self.fpath is None and fpath:
 494             self.fpath = fpath
 495
 496     def find(self, st, by='msgid', include_obsolete_entries=False, msgctxt=False):
 497         """
 498         Find the entry which msgid (or property identified by the ``by``
 499         argument) matches the string ``st``.
 500
 501         Keyword arguments:
 502
 503         ``st``
 504             string, the string to search for.
 505
 506         ``by``
 507             string, the property to use for comparison (default: ``msgid``).
 508
 509         ``include_obsolete_entries``
 510             boolean, whether to also search in entries that are obsolete.
 511
 512         ``msgctxt``
 513             string, allows specifying a specific message context for the
 514             search.
 515         """
 516         if include_obsolete_entries:
 517             entries = self[:]
 518         else:
 519             entries = [e for e in self if not e.obsolete]
 520         matches = []
 521         for e in entries:
 522             if getattr(e, by) == st:
 523                 if msgctxt is not False and e.msgctxt != msgctxt:
 524                     continue
 525                 matches.append(e)
 526         if len(matches) == 1:
 527             return matches[0]
 528         elif len(matches) > 1:
 529             if not msgctxt:
 530                 # find the entry with no msgctx
 531                 e = None
 532                 for m in matches:
 533                     if not m.msgctxt:
 534                         e = m
 535                 if e:
 536                     return e
 537                 # fallback to the first entry found
 538                 return matches[0]
 539         return None
 540
 541     def ordered_metadata(self):
 542         """
 543         Convenience method that returns an ordered version of the metadata
 544         dictionary. The return value is list of tuples (metadata name,
 545         metadata_value).
 546         """
 547         # copy the dict first
 548         metadata = self.metadata.copy()
 549         data_order = [
 550             'Project-Id-Version',
 551             'Report-Msgid-Bugs-To',
 552             'POT-Creation-Date',
 553             'PO-Revision-Date',
 554             'Last-Translator',
 555             'Language-Team',
 556             'Language',
 557             'MIME-Version',
 558             'Content-Type',
 559             'Content-Transfer-Encoding',
 560             'Plural-Forms',
 561         ]
 562         ordered_data = []
 563         for data in data_order:
 564             try:
 565                 value = metadata.pop(data)
 566                 ordered_data.append((data, value))
 567             except KeyError:
 568                 pass
 569         # the rest of the metadata will be alphabetically ordered since there
 570         # are no specs for this AFAIK
 571         for data in natural_sort(metadata.keys()):
 572             value = metadata[data]
 573             ordered_data.append((data, value))
 574         return ordered_data
 575
 576     def to_binary(self):
 577         """
 578         Return the binary representation of the file.
 579         """
 580         offsets = []
 581         entries = self.translated_entries()  # pylint: disable=no-member
 582
 583         # the keys are sorted in the .mo file
 584         def cmp(_self, other):  # pylint: disable=unused-variable
 585             # msgfmt compares entries with msgctxt if it exists
 586             self_msgid = _self.msgctxt or _self.msgid
 587             other_msgid = other.msgctxt or other.msgid
 588             if self_msgid > other_msgid:
 589                 return 1
 590             elif self_msgid < other_msgid:
 591                 return -1
 592             else:
 593                 return 0
 594
 595         # add metadata entry
 596         entries.sort(key=lambda o: o.msgid_with_context.encode('utf-8'))
 597         mentry = self.metadata_as_entry()
 598         entries = [mentry] + entries
 599         entries_len = len(entries)
 600         ids, strs = b(''), b('')
 601         for e in entries:
 602             # For each string, we need size and file offset.  Each string is
 603             # NUL terminated; the NUL does not count into the size.
 604             msgid = b('')
 605             if e.msgctxt:
 606                 # Contexts are stored by storing the concatenation of the
 607                 # context, a <EOT> byte, and the original string
 608                 msgid = self._encode(e.msgctxt + '\4')
 609             if e.msgid_plural:
 610                 msgstr = []
 611                 for index in sorted(e.msgstr_plural.keys()):
 612                     msgstr.append(e.msgstr_plural[index])
 613                 msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
 614                 msgstr = self._encode('\0'.join(msgstr))
 615             else:
 616                 msgid += self._encode(e.msgid)
 617                 msgstr = self._encode(e.msgstr)
 618             offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
 619             ids += msgid + b('\0')
 620             strs += msgstr + b('\0')
 621
 622         # The header is 7 32-bit unsigned integers.
 623         keystart = 7 * 4 + 16 * entries_len
 624         # and the values start after the keys
 625         valuestart = keystart + len(ids)
 626         koffsets = []
 627         voffsets = []
 628         # The string table first has the list of keys, then the list of values.
 629         # Each entry has first the size of the string, then the file offset.
 630         for o1, l1, o2, l2 in offsets:
 631             koffsets += [l1, o1 + keystart]
 632             voffsets += [l2, o2 + valuestart]
 633         offsets = koffsets + voffsets
 634
 635         output = struct.pack(
 636             "Iiiiiii",
 637             # Magic number
 638             MOFile.MAGIC,
 639             # Version
 640             0,
 641             # number of entries
 642             entries_len,
 643             # start of key index
 644             7 * 4,
 645             # start of value index
 646             7 * 4 + entries_len * 8,
 647             # size and offset of hash table, we don't use hash tables
 648             0,
 649             keystart,
 650         )
 651         if PY3 and sys.version_info.minor > 1:  # python 3.2 or superior
 652             output += array.array("i", offsets).tobytes()
 653         else:
 654             output += array.array("i", offsets).tostring()  # pylint: disable=no-member
 655         output += ids
 656         output += strs
 657         return output
 658
 659     def _encode(self, mixed):
 660         """
 661         Encodes the given ``mixed`` argument with the file encoding if and
 662         only if it's an unicode string and returns the encoded string.
 663         """
 664         if isinstance(mixed, text_type):
 665             mixed = mixed.encode(self.encoding)
 666         return mixed
 667
 668
 669 # }}}
 670 # class POFile {{{
 671
 672
 673 class POFile(_BaseFile):
 674     """
 675     Po (or Pot) file reader/writer.
 676     This class inherits the :class:`~polib._BaseFile` class and, by extension,
 677     the python ``list`` type.
 678     """
 679
 680     def __unicode__(self):
 681         """
 682         Returns the unicode representation of the po file.
 683         """
 684         ret, headers = '', self.header.split('\n')
 685         for header in headers:
 686             if not header:
 687                 ret += "#\n"
 688             elif header[:1] in [',', ':']:
 689                 ret += '#%s\n' % header
 690             else:
 691                 ret += '# %s\n' % header
 692
 693         if not isinstance(ret, text_type):
 694             ret = ret.decode(self.encoding)
 695
 696         return ret + _BaseFile.__unicode__(self)
 697
 698     def save_as_mofile(self, fpath):
 699         """
 700         Saves the binary representation of the file to given ``fpath``.
 701
 702         Keyword argument:
 703
 704         ``fpath``
 705             string, full or relative path to the mo file.
 706         """
 707         _BaseFile.save(self, fpath, 'to_binary')
 708
 709     def percent_translated(self):
 710         """
 711         Convenience method that returns the percentage of translated
 712         messages.
 713         """
 714         total = len([e for e in self if not e.obsolete])
 715         if total == 0:
 716             return 100
 717         translated = len(self.translated_entries())
 718         return int(translated * 100 / float(total))
 719
 720     def translated_entries(self):
 721         """
 722         Convenience method that returns the list of translated entries.
 723         """
 724         return [e for e in self if e.translated()]
 725
 726     def untranslated_entries(self):
 727         """
 728         Convenience method that returns the list of untranslated entries.
 729         """
 730         return [
 731             e for e in self if not e.translated() and not e.obsolete and not e.fuzzy
 732         ]
 733
 734     def fuzzy_entries(self):
 735         """
 736         Convenience method that returns the list of fuzzy entries.
 737         """
 738         return [e for e in self if e.fuzzy and not e.obsolete]
 739
 740     def obsolete_entries(self):
 741         """
 742         Convenience method that returns the list of obsolete entries.
 743         """
 744         return [e for e in self if e.obsolete]
 745
 746     def merge(self, refpot):
 747         """
 748         Convenience method that merges the current pofile with the pot file
 749         provided. It behaves exactly as the gettext msgmerge utility:
 750
 751         * comments of this file will be preserved, but extracted comments and
 752           occurrences will be discarded;
 753         * any translations or comments in the file will be discarded, however,
 754           dot comments and file positions will be preserved;
 755         * the fuzzy flags are preserved.
 756
 757         Keyword argument:
 758
 759         ``refpot``
 760             object POFile, the reference catalog.
 761         """
 762         # Store entries in dict/set for faster access
 763         self_entries = dict((entry.msgid_with_context, entry) for entry in self)
 764         refpot_msgids = set(entry.msgid_with_context for entry in refpot)
 765         # Merge entries that are in the refpot
 766         for entry in refpot:
 767             e = self_entries.get(entry.msgid_with_context)
 768             if e is None:
 769                 e = POEntry()
 770                 self.append(e)
 771             e.merge(entry)
 772         # ok, now we must "obsolete" entries that are not in the refpot anymore
 773         for entry in self:
 774             if entry.msgid_with_context not in refpot_msgids:
 775                 entry.obsolete = True
 776
 777
 778 # }}}
 779 # class MOFile {{{
 780
 781
 782 class MOFile(_BaseFile):
 783     """
 784     Mo file reader/writer.
 785     This class inherits the :class:`~polib._BaseFile` class and, by
 786     extension, the python ``list`` type.
 787     """
 788
 789     MAGIC = 0x950412DE
 790     MAGIC_SWAPPED = 0xDE120495
 791
 792     def __init__(self, *args, **kwargs):
 793         """
 794         Constructor, accepts all keywords arguments accepted by
 795         :class:`~polib._BaseFile` class.
 796         """
 797         _BaseFile.__init__(self, *args, **kwargs)
 798         self.magic_number = None
 799         self.version = 0
 800
 801     def save_as_pofile(self, fpath):
 802         """
 803         Saves the mofile as a pofile to ``fpath``.
 804
 805         Keyword argument:
 806
 807         ``fpath``
 808             string, full or relative path to the file.
 809         """
 810         _BaseFile.save(self, fpath)
 811
 812     # pylint: disable=arguments-differ
 813     def save(self, fpath=None):
 814         """
 815         Saves the mofile to ``fpath``.
 816
 817         Keyword argument:
 818
 819         ``fpath``
 820             string, full or relative path to the file.
 821         """
 822         _BaseFile.save(self, fpath, 'to_binary')
 823
 824     def percent_translated(self):
 825         """
 826         Convenience method to keep the same interface with POFile instances.
 827         """
 828         return 100
 829
 830     def translated_entries(self):
 831         """
 832         Convenience method to keep the same interface with POFile instances.
 833         """
 834         return self
 835
 836     def untranslated_entries(self):
 837         """
 838         Convenience method to keep the same interface with POFile instances.
 839         """
 840         return []
 841
 842     def fuzzy_entries(self):
 843         """
 844         Convenience method to keep the same interface with POFile instances.
 845         """
 846         return []
 847
 848     def obsolete_entries(self):
 849         """
 850         Convenience method to keep the same interface with POFile instances.
 851         """
 852         return []
 853
 854
 855 # }}}
 856 # class _BaseEntry {{{
 857
 858
 859 class _BaseEntry(object):
 860     """
 861     Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
 862     This class should **not** be instantiated directly.
 863     """
 864
 865     def __init__(self, *_args, **kwargs):
 866         """
 867         Constructor, accepts the following keyword arguments:
 868
 869         ``msgid``
 870             string, the entry msgid.
 871
 872         ``msgstr``
 873             string, the entry msgstr.
 874
 875         ``msgid_plural``
 876             string, the entry msgid_plural.
 877
 878         ``msgstr_plural``
 879             dict, the entry msgstr_plural lines.
 880
 881         ``msgctxt``
 882             string, the entry context (msgctxt).
 883
 884         ``obsolete``
 885             bool, whether the entry is "obsolete" or not.
 886
 887         ``encoding``
 888             string, the encoding to use, defaults to ``default_encoding``
 889             global variable (optional).
 890         """
 891         self.msgid = kwargs.get('msgid', '')
 892         self.msgstr = kwargs.get('msgstr', '')
 893         self.msgid_plural = kwargs.get('msgid_plural', '')
 894         self.msgstr_plural = kwargs.get('msgstr_plural', {})
 895         self.msgctxt = kwargs.get('msgctxt', None)
 896         self.obsolete = kwargs.get('obsolete', False)
 897         self.encoding = kwargs.get('encoding', default_encoding)
 898
 899     def __unicode__(self, wrapwidth=78):
 900         """
 901         Returns the unicode representation of the entry.
 902         """
 903         if self.obsolete:
 904             delflag = '#~ '
 905         else:
 906             delflag = ''
 907         ret = []
 908         # write the msgctxt if any
 909         if self.msgctxt is not None:
 910             ret += self._str_field("msgctxt", delflag, "", self.msgctxt, wrapwidth)
 911         # write the msgid
 912         ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
 913         # write the msgid_plural if any
 914         if self.msgid_plural:
 915             ret += self._str_field(
 916                 "msgid_plural", delflag, "", self.msgid_plural, wrapwidth
 917             )
 918         if self.msgstr_plural:
 919             # write the msgstr_plural if any
 920             msgstrs = self.msgstr_plural
 921             keys = list(msgstrs)
 922             keys.sort()
 923             for index in keys:
 924                 msgstr = msgstrs[index]
 925                 plural_index = '[%s]' % index
 926                 ret += self._str_field(
 927                     "msgstr", delflag, plural_index, msgstr, wrapwidth
 928                 )
 929         else:
 930             # otherwise write the msgstr
 931             ret += self._str_field("msgstr", delflag, "", self.msgstr, wrapwidth)
 932         ret.append('')
 933         ret = u('\n').join(ret)
 934         return ret
 935
 936     if PY3:
 937
 938         def __str__(self):
 939             return self.__unicode__()
 940
 941     else:
 942
 943         def __str__(self):
 944             """
 945             Returns the string representation of the entry.
 946             """
 947             return compat.ustr(self).encode(self.encoding)
 948
 949     def __eq__(self, other):
 950         return str(self) == str(other)
 951
 952     def __hash__(self):
 953         return hash(str(self))
 954
 955     def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78):
 956         lines = field.splitlines(True)
 957         if len(lines) > 1:
 958             lines = [''] + lines  # start with initial empty line
 959         else:
 960             escaped_field = escape(field)
 961             specialchars_count = 0
 962             for c in ['\\', '\n', '\r', '\t', '"']:
 963                 specialchars_count += field.count(c)
 964             # comparison must take into account fieldname length + one space
 965             # + 2 quotes (eg. msgid "<string>")
 966             flength = len(fieldname) + 3
 967             if plural_index:
 968                 flength += len(plural_index)
 969             real_wrapwidth = wrapwidth - flength + specialchars_count
 970             if wrapwidth > 0 and len(field) > real_wrapwidth:
 971                 # Wrap the line but take field name into account
 972                 lines = [''] + [
 973                     unescape(item)
 974                     for item in textwrap.wrap(
 975                         escaped_field,
 976                         wrapwidth - 2,  # 2 for quotes ""
 977                         drop_whitespace=False,
 978                         break_long_words=False,
 979                     )
 980                 ]
 981             else:
 982                 lines = [field]
 983         if fieldname.startswith('previous_'):
 984             # quick and dirty trick to get the real field name
 985             fieldname = fieldname[9:]
 986
 987         ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index, escape(lines.pop(0)))]
 988         for line in lines:
 989             ret.append('%s"%s"' % (delflag, escape(line)))
 990         return ret
 991
 992     @property
 993     def msgid_with_context(self):
 994         if self.msgctxt:
 995             return '%s%s%s' % (self.msgctxt, "\x04", self.msgid)
 996         return self.msgid
 997
 998
 999 # }}}
1000 # class POEntry {{{
1001
1002
class POEntry(_BaseEntry):
    """
    Represents a po file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments (all the
        arguments are also forwarded to the ``_BaseEntry`` constructor):

        ``comment``
            string, the entry comment.

        ``tcomment``
            string, the entry translator comment.

        ``occurrences``
            list, the entry occurrences.

        ``flags``
            list, the entry flags.

        ``previous_msgctxt``
            string, the entry previous context.

        ``previous_msgid``
            string, the entry previous msgid.

        ``previous_msgid_plural``
            string, the entry previous msgid_plural.

        ``linenum``
            integer, the line number of the entry
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        self.comment = kwargs.get('comment', '')
        self.tcomment = kwargs.get('tcomment', '')
        self.occurrences = kwargs.get('occurrences', [])
        self.flags = kwargs.get('flags', [])
        self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
        self.previous_msgid = kwargs.get('previous_msgid', None)
        self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
        self.linenum = kwargs.get('linenum', None)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry.

        The comments, occurrences, flags and "previous" fields are written
        first, followed by the fields handled by ``_BaseEntry``.  Comments
        and occurrences are only wrapped when ``wrapwidth`` is greater than
        zero.  For obsolete entries only the translator comment is written
        and the occurrences are left out.
        """
        ret = []
        # comments first, if any (with text wrapping as xgettext does)
        if self.obsolete:
            comments = [('tcomment', '# ')]
        else:
            comments = [('comment', '#. '), ('tcomment', '# ')]
        for c in comments:
            val = getattr(self, c[0])
            if val:
                for comment in val.split('\n'):
                    if len(comment) + len(c[1]) > wrapwidth > 0:
                        ret += textwrap.wrap(
                            comment,
                            wrapwidth,
                            initial_indent=c[1],
                            subsequent_indent=c[1],
                            break_long_words=False,
                        )
                    else:
                        ret.append('%s%s' % (c[1], comment))

        # occurrences (with text wrapping as xgettext does)
        if not self.obsolete and self.occurrences:
            filelist = []
            for fpath, lineno in self.occurrences:
                if lineno:
                    filelist.append('%s:%s' % (fpath, lineno))
                else:
                    filelist.append(fpath)
            filestr = ' '.join(filelist)
            if len(filestr) + 3 > wrapwidth > 0:
                # textwrap split words that contain hyphen, this is not
                # what we want for filenames, so the dirty hack is to
                # temporally replace hyphens with a char that a file cannot
                # contain, like "*"
                ret += [
                    line.replace('*', '-')
                    for line in textwrap.wrap(
                        filestr.replace('-', '*'),
                        wrapwidth,
                        initial_indent='#: ',
                        subsequent_indent='#: ',
                        break_long_words=False,
                    )
                ]
            else:
                ret.append('#: ' + filestr)

        # flags (TODO: wrapping ?)
        if self.flags:
            ret.append('#, %s' % ', '.join(self.flags))

        # previous context and previous msgid/msgid_plural
        fields = ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural']
        if self.obsolete:
            prefix = "#~| "
        else:
            prefix = "#| "
        for f in fields:
            val = getattr(self, f)
            if val is not None:
                ret += self._str_field(f, prefix, "", val, wrapwidth)

        ret.append(_BaseEntry.__unicode__(self, wrapwidth))
        ret = u('\n').join(ret)
        return ret

    # pylint: disable=too-many-return-statements
    def __cmp__(self, other):
        """
        Called by comparison operations if rich comparison is not defined.

        Returns ``-1``, ``0`` or ``1``.  The criteria are checked in this
        order and the first one that differs decides: obsolete flag (an
        obsolete entry sorts first), sorted occurrences, msgctxt,
        msgid_plural, msgstr_plural values, msgid and finally msgstr.
        """
        # First: Obsolete test
        if self.obsolete != other.obsolete:
            if self.obsolete:
                return -1
            else:
                return 1
        # Work on a copy to protect original
        occ1 = sorted(self.occurrences[:])
        occ2 = sorted(other.occurrences[:])
        if occ1 > occ2:
            return 1
        if occ1 < occ2:
            return -1
        # Compare context (a missing context is compared as '0')
        msgctxt = self.msgctxt or '0'
        othermsgctxt = other.msgctxt or '0'
        if msgctxt > othermsgctxt:
            return 1
        elif msgctxt < othermsgctxt:
            return -1
        # Compare msgid_plural (a missing plural is compared as '0')
        msgid_plural = self.msgid_plural or '0'
        othermsgid_plural = other.msgid_plural or '0'
        if msgid_plural > othermsgid_plural:
            return 1
        elif msgid_plural < othermsgid_plural:
            return -1
        # Compare msgstr_plural
        if self.msgstr_plural and isinstance(self.msgstr_plural, dict):
            msgstr_plural = list(self.msgstr_plural.values())
        else:
            msgstr_plural = []
        if other.msgstr_plural and isinstance(other.msgstr_plural, dict):
            othermsgstr_plural = list(other.msgstr_plural.values())
        else:
            othermsgstr_plural = []
        if msgstr_plural > othermsgstr_plural:
            return 1
        elif msgstr_plural < othermsgstr_plural:
            return -1
        # Compare msgid
        if self.msgid > other.msgid:
            return 1
        elif self.msgid < other.msgid:
            return -1
        # Compare msgstr
        if self.msgstr > other.msgstr:
            return 1
        elif self.msgstr < other.msgstr:
            return -1
        return 0

    # The rich comparisons below all delegate to __cmp__ so that the
    # ordering is the same on python 2 and python 3.

    def __gt__(self, other):
        return self.__cmp__(other) > 0

    def __lt__(self, other):
        return self.__cmp__(other) < 0

    def __ge__(self, other):
        return self.__cmp__(other) >= 0

    def __le__(self, other):
        return self.__cmp__(other) <= 0

    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0

    def translated(self):
        """
        Returns ``True`` if the entry has been translated or ``False``
        otherwise.

        Obsolete and fuzzy entries are never considered translated.  An
        entry with plural forms and an empty msgstr is translated only when
        none of its plural forms is empty.
        """
        if self.obsolete or self.fuzzy:
            return False
        if self.msgstr != '':
            return True
        if self.msgstr_plural:
            for pos in self.msgstr_plural:
                if self.msgstr_plural[pos] == '':
                    return False
            return True
        return False

    def merge(self, other):
        """
        Merge the current entry with the given pot entry.

        The msgid, msgctxt, occurrences, comment, flags, msgid_plural,
        obsolete state and "previous" fields are taken from ``other``.  The
        msgstr and the existing msgstr_plural translations are kept; plural
        positions that only exist in ``other`` are added as empty strings.
        If the current entry was fuzzy it stays fuzzy.
        """
        self.msgid = other.msgid
        self.msgctxt = other.msgctxt
        # clone occurrences (like the flags below) so that a later change
        # made to one entry does not silently show up in the other one
        self.occurrences = list(other.occurrences)
        self.comment = other.comment
        fuzzy = self.fuzzy
        self.flags = other.flags[:]  # clone flags
        # keep the fuzzy state, without duplicating the flag when the
        # other entry is flagged fuzzy too
        if fuzzy and 'fuzzy' not in self.flags:
            self.flags.append('fuzzy')
        self.msgid_plural = other.msgid_plural
        self.obsolete = other.obsolete
        self.previous_msgctxt = other.previous_msgctxt
        self.previous_msgid = other.previous_msgid
        self.previous_msgid_plural = other.previous_msgid_plural
        if other.msgstr_plural:
            for pos in other.msgstr_plural:
                # keep existing translation at pos if any
                if pos not in self.msgstr_plural:
                    self.msgstr_plural[pos] = ''

    @property
    def fuzzy(self):
        """``True`` when ``'fuzzy'`` is one of the entry flags."""
        return 'fuzzy' in self.flags

    def __hash__(self):
        """Hash the entry on its ``(msgid, msgstr)`` pair."""
        return hash((self.msgid, self.msgstr))
1239
1240
1241 # }}}
1242 # class MOEntry {{{
1243
1244
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor.  For consistency with :class:`~polib.POEntry` the
        following keyword arguments are accepted:

        ``comment``
        ``tcomment``
        ``occurrences``
        ``flags``
        ``previous_msgctxt``
        ``previous_msgid``
        ``previous_msgid_plural``

        Note: they hold no real meaning in the context of MO files, so
        whatever is passed for them is ignored and the matching attributes
        are always set to empty values.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # po-only attributes, always reset to their empty value
        self.previous_msgctxt = self.previous_msgid = None
        self.previous_msgid_plural = None
        self.comment = self.tcomment = ''
        self.occurrences = []
        self.flags = []

    def __hash__(self):
        """Hash the entry on its ``(msgid, msgstr)`` pair."""
        key = (self.msgid, self.msgstr)
        return hash(key)
1278
1279
1280 # }}}
1281 # class _POFileParser {{{
1282
1283
class _POFileParser(object):
    """
    A finite state machine to parse efficiently and correctly po
    file format.

    The constructor registers the allowed transitions, ``parse()`` splits
    the input in lines, finds out the symbol matching each line and hands it
    to ``process()`` which calls the matching ``handle_*`` method.
    """

    # pylint: disable=redefined-outer-name
    def __init__(self, pofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``pofile``
            string, path to the po file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            the class instantiated to hold the parsed entries (optional,
            ``POFile`` is used when missing or ``None``).
        """
        enc = kwargs.get('encoding', default_encoding)
        if _is_file(pofile):
            try:
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
            except LookupError:
                # unknown encoding name: fall back on the default one
                enc = default_encoding
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
        else:
            # not a file: the argument is the content itself
            self.fhandle = pofile.splitlines()

        klass = kwargs.get('klass')
        if klass is None:
            klass = POFile
        self.instance = klass(
            pofile=pofile,
            encoding=enc,
            check_for_duplicates=kwargs.get('check_for_duplicates', False),
        )
        self.transitions = {}
        self.current_line = 0
        self.current_entry = POEntry(linenum=self.current_line)
        self.current_state = 'st'
        self.current_token = None
        # two memo flags used in handlers
        self.msgstr_index = 0
        self.entry_obsolete = 0
        # Configure the state machine, by adding transitions.
        # Signification of symbols:
        #     * ST: Beginning of the file (start)
        #     * HE: Header
        #     * TC: a translation comment
        #     * GC: a generated comment
        #     * OC: a file/line occurrence
        #     * FL: a flags line
        #     * CT: a message context
        #     * PC: a previous msgctxt
        #     * PM: a previous msgid
        #     * PP: a previous msgid_plural
        #     * MI: a msgid
        #     * MP: a msgid plural
        #     * MS: a msgstr
        #     * MX: a msgstr plural
        #     * MC: a msgid or msgstr continuation line
        all_states = [
            'st',
            'he',
            'gc',
            'oc',
            'fl',
            'ct',
            'pc',
            'pm',
            'pp',
            'tc',
            'ms',
            'mp',
            'mx',
            'mi',
        ]

        self.add('tc', ['st', 'he'], 'he')
        self.add(
            'tc',
            ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mp', 'mx', 'mi'],
            'tc',
        )
        self.add('gc', all_states, 'gc')
        self.add('oc', all_states, 'oc')
        self.add('fl', all_states, 'fl')
        self.add('pc', all_states, 'pc')
        self.add('pm', all_states, 'pm')
        self.add('pp', all_states, 'pp')
        self.add(
            'ct',
            ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
            'ct',
        )
        self.add(
            'mi',
            ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
            'mi',
        )
        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
        self.add('ms', ['mi', 'mp', 'tc'], 'ms')
        self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')

    def parse(self):
        """
        Run the state machine, parse the file line by line and call process()
        with the current matched symbol.

        Returns the file instance created by the constructor, filled with
        the parsed entries and metadata.  Raises ``IOError`` on a syntax
        error.  The opened file, if any, is closed in every case.
        """
        try:
            self._parse_lines()
            self._extract_metadata()
        finally:
            # close opened file, also when a syntax error is being raised
            if not isinstance(self.fhandle, list):  # must be file
                self.fhandle.close()
        return self.instance

    # pylint: disable=too-many-branches
    def _parse_lines(self):
        """Feed every line to the state machine and append the entries."""
        keywords = {
            'msgctxt': 'ct',
            'msgid': 'mi',
            'msgstr': 'ms',
            'msgid_plural': 'mp',
        }
        prev_keywords = {
            'msgid_plural': 'pp',
            'msgid': 'pm',
            'msgctxt': 'pc',
        }
        tokens = []
        fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
        for line in self.fhandle:
            self.current_line += 1
            if self.current_line == 1:
                # drop the byte order mark, if any
                BOM = codecs.BOM_UTF8.decode('utf-8')
                if line.startswith(BOM):
                    line = line[len(BOM) :]
            line = line.strip()
            if line == '':
                continue

            tokens = line.split(None, 2)
            nb_tokens = len(tokens)

            # previous fields of obsolete entries are skipped
            if tokens[0] == '#~|':
                continue

            if tokens[0] == '#~' and nb_tokens > 1:
                # obsolete line: drop the marker and parse what follows
                line = line[3:].strip()
                tokens = tokens[1:]
                nb_tokens -= 1
                self.entry_obsolete = 1
            else:
                self.entry_obsolete = 0

            # Take care of keywords like
            # msgid, msgid_plural, msgctxt & msgstr.
            if tokens[0] in keywords and nb_tokens > 1:
                line = line[len(tokens[0]) :].lstrip()
                if re.search(r'([^\\]|^)"', line[1:-1]):
                    raise IOError(
                        'Syntax error in po file %s(line %s): '
                        'unescaped double quote found' % (fpath, self.current_line)
                    )
                self.current_token = line
                self.process(keywords[tokens[0]])
                continue

            self.current_token = line

            if tokens[0] == '#:':
                if nb_tokens <= 1:
                    continue
                # we are on a occurrences line
                self.process('oc')

            elif line[:1] == '"':
                # we are on a continuation line
                if re.search(r'([^\\]|^)"', line[1:-1]):
                    raise IOError(
                        'Syntax error in po file %s(line %s): '
                        'unescaped double quote found' % (fpath, self.current_line)
                    )
                self.process('mc')

            elif line[:7] == 'msgstr[':
                # we are on a msgstr plural
                self.process('mx')

            elif tokens[0] == '#,':
                if nb_tokens <= 1:
                    continue
                # we are on a flags line
                self.process('fl')

            elif tokens[0] == '#' or tokens[0].startswith('##'):
                # we are on a translator comment line
                self.process('tc')

            elif tokens[0] == '#.':
                if nb_tokens <= 1:
                    continue
                # we are on a generated comment line
                self.process('gc')

            elif tokens[0] == '#|':
                if nb_tokens <= 1:
                    raise IOError(
                        'Syntax error in po file %s(line %s)'
                        % (fpath, self.current_line)
                    )

                # Remove the marker and any whitespace right after that.
                line = line[2:].lstrip()
                self.current_token = line

                if tokens[1].startswith('"'):
                    # Continuation of previous metadata.
                    self.process('mc')
                    continue

                if nb_tokens == 2:
                    # Invalid continuation line.
                    raise IOError(
                        'Syntax error in po file %s(line %s): '
                        'invalid continuation line' % (fpath, self.current_line)
                    )

                # we are on a "previous translation" comment line,
                if tokens[1] not in prev_keywords:
                    # Unknown keyword in previous translation comment.
                    raise IOError(
                        'Syntax error in po file %s(line %s): '
                        'unknown keyword %s' % (fpath, self.current_line, tokens[1])
                    )

                # Remove the keyword and any whitespace
                # between it and the starting quote.
                line = line[len(tokens[1]) :].lstrip()
                self.current_token = line
                self.process(prev_keywords[tokens[1]])

            else:
                raise IOError(
                    'Syntax error in po file %s(line %s)' % (fpath, self.current_line)
                )

        if self.current_entry and len(tokens) > 0 and not tokens[0].startswith('#'):
            # since entries are added when another entry is found, we must add
            # the last entry here (only if there are lines). Trailing comments
            # are ignored
            self.instance.append(self.current_entry)

    def _extract_metadata(self):
        """Move the entry with an empty msgid, if any, to the metadata."""
        metadataentry = self.instance.find('')
        if metadataentry:  # metadata found
            # remove the entry
            self.instance.remove(metadataentry)
            self.instance.metadata_is_fuzzy = metadataentry.flags
            key = None
            for msg in metadataentry.msgstr.splitlines():
                try:
                    key, val = msg.split(':', 1)
                    self.instance.metadata[key] = val.strip()
                except (ValueError, KeyError):
                    # no colon on the line: continuation of the previous key
                    if key is not None:
                        self.instance.metadata[key] += '\n' + msg.strip()

    def add(self, symbol, states, next_state):
        """
        Add a transition to the state machine.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``states``
            list, a list of states (two chars symbols).

        ``next_state``
            the next state the fsm will have after the action.
        """
        for state in states:
            action = getattr(self, 'handle_%s' % next_state)
            self.transitions[(symbol, state)] = (action, next_state)

    def process(self, symbol):
        """
        Process the transition corresponding to the current state and the
        symbol provided.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        The state only changes when the handler returns a true value.  A
        missing transition or any exception raised by the handler closes the
        file and is reported as an ``IOError`` (syntax error) carrying the
        current line number.
        """
        try:
            (action, state) = self.transitions[(symbol, self.current_state)]
            if action():
                self.current_state = state
        except Exception:
            fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
            if hasattr(self.fhandle, 'close'):
                self.fhandle.close()
            raise IOError(
                'Syntax error in po file %s(line %s)' % (fpath, self.current_line)
            )

    # state handlers

    def handle_he(self):
        """Handle a header comment."""
        if self.instance.header != '':
            self.instance.header += '\n'
        self.instance.header += self.current_token[2:]
        return 1

    def handle_tc(self):
        """Handle a translator comment."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        if self.current_entry.tcomment != '':
            self.current_entry.tcomment += '\n'
        tcomment = self.current_token.lstrip('#')
        if tcomment.startswith(' '):
            tcomment = tcomment[1:]
        self.current_entry.tcomment += tcomment
        return True

    def handle_gc(self):
        """Handle a generated comment."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        if self.current_entry.comment != '':
            self.current_entry.comment += '\n'
        self.current_entry.comment += self.current_token[3:]
        return True

    def handle_oc(self):
        """Handle a file:num occurrence."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        occurrences = self.current_token[3:].split()
        for occurrence in occurrences:
            if occurrence != '':
                try:
                    fil, line = occurrence.rsplit(':', 1)
                    if not line.isdigit():
                        # what follows the colon is not a line number
                        fil = occurrence
                        line = ''
                    self.current_entry.occurrences.append((fil, line))
                except (ValueError, AttributeError):
                    self.current_entry.occurrences.append((occurrence, ''))
        return True

    def handle_fl(self):
        """Handle a flags line."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        self.current_entry.flags += [
            c.strip() for c in self.current_token[3:].split(',')
        ]
        return True

    def handle_pp(self):
        """Handle a previous msgid_plural line."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        self.current_entry.previous_msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_pm(self):
        """Handle a previous msgid line."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        self.current_entry.previous_msgid = unescape(self.current_token[1:-1])
        return True

    def handle_pc(self):
        """Handle a previous msgctxt line."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        self.current_entry.previous_msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_ct(self):
        """Handle a msgctxt."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_mi(self):
        """Handle a msgid."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)
        self.current_entry.obsolete = self.entry_obsolete
        self.current_entry.msgid = unescape(self.current_token[1:-1])
        return True

    def handle_mp(self):
        """Handle a msgid plural."""
        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_ms(self):
        """Handle a msgstr."""
        self.current_entry.msgstr = unescape(self.current_token[1:-1])
        return True

    def handle_mx(self):
        """
        Handle a msgstr plural.

        The index is made of all the digits following ``msgstr[``, so
        indexes greater than 9 are supported.  A token without any digit
        there raises ``ValueError``, which ``process()`` reports as a syntax
        error.
        """
        token = self.current_token
        found = re.match(r'msgstr\[(\d+)', token)
        if found is None:
            raise ValueError('invalid msgstr plural index')
        index = int(found.group(1))
        value = token[token.find('"') + 1 : -1]
        self.current_entry.msgstr_plural[index] = unescape(value)
        # remembered for the continuation lines, see handle_mc
        self.msgstr_index = index
        return True

    def handle_mc(self):
        """Handle a msgid or msgstr continuation line."""
        token = unescape(self.current_token[1:-1])
        # the text is appended to the field matching the current state
        if self.current_state == 'ct':
            self.current_entry.msgctxt += token
        elif self.current_state == 'mi':
            self.current_entry.msgid += token
        elif self.current_state == 'mp':
            self.current_entry.msgid_plural += token
        elif self.current_state == 'ms':
            self.current_entry.msgstr += token
        elif self.current_state == 'mx':
            self.current_entry.msgstr_plural[self.msgstr_index] += token
        elif self.current_state == 'pp':
            self.current_entry.previous_msgid_plural += token
        elif self.current_state == 'pm':
            self.current_entry.previous_msgid += token
        elif self.current_state == 'pc':
            self.current_entry.previous_msgctxt += token
        # don't change the current state
        return False
1744
1745
1746 # }}}
1747 # class _MOFileParser {{{
1748
1749
class _MOFileParser(object):
    """
    A class to parse binary mo files.
    """

    # pylint: disable=unused-argument,redefined-outer-name
    def __init__(self, mofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``mofile``
            string, path to the mo file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class instantiated to hold the parsed data (optional, ``MOFile``
            is used when the argument is missing or ``None``).
        """
        # Defined before anything can fail so that __del__ always finds the
        # attribute, even when open() raises.
        self.fhandle = None
        if _is_file(mofile):
            self.fhandle = open(mofile, 'rb')
        else:
            self.fhandle = io.BytesIO(mofile)

        klass = kwargs.get('klass')
        if klass is None:
            klass = MOFile
        self.instance = klass(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False),
        )

    def __del__(self):
        """
        Make sure the file is closed, this prevents warnings on unclosed file
        when running tests with python >= 3.2.
        """
        # getattr: the attribute is missing on instances whose __init__ never
        # ran (or failed early); raising from __del__ would only produce an
        # "Exception ignored" message.
        fhandle = getattr(self, 'fhandle', None)
        if fhandle is not None and hasattr(fhandle, 'close'):
            fhandle.close()

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Returns the populated instance.  Raises ``IOError`` when the magic
        number or the major revision number is not an expected one.  The
        file handle is closed when this method returns or raises.
        """
        try:
            self._parse_entries()
        finally:
            # close opened file, also when parsing failed
            self.fhandle.close()
        return self.instance

    def _parse_entries(self):
        """Read header, index tables and strings into ``self.instance``."""
        # parse magic number
        magic_number = self._readbinary('<I', 4)
        if magic_number == MOFile.MAGIC:
            ii = '<II'
        elif magic_number == MOFile.MAGIC_SWAPPED:
            ii = '>II'
        else:
            raise IOError('Invalid mo file, magic number is incorrect !')
        self.instance.magic_number = magic_number
        # parse the version number and the number of strings
        version, numofstrings = self._readbinary(ii, 8)
        # from MO file format specs: "A program seeing an unexpected major
        # revision number should stop reading the MO file entirely"
        if version >> 16 not in (0, 1):
            raise IOError('Invalid mo file, unexpected major revision number')
        self.instance.version = version
        # original strings and translation strings hash table offset
        msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
        # move to msgid hash table and read length and offset of msgids
        self.fhandle.seek(msgids_hash_offset)
        msgids_index = [self._readbinary(ii, 8) for _ in range(numofstrings)]
        # move to msgstr hash table and read length and offset of msgstrs
        self.fhandle.seek(msgstrs_hash_offset)
        msgstrs_index = [self._readbinary(ii, 8) for _ in range(numofstrings)]
        # build entries
        pairs = zip(msgids_index, msgstrs_index)
        for i, (msgid_ref, msgstr_ref) in enumerate(pairs):
            # each reference is a (length, offset) pair
            self.fhandle.seek(msgid_ref[1])
            msgid = self.fhandle.read(msgid_ref[0])

            self.fhandle.seek(msgstr_ref[1])
            msgstr = self.fhandle.read(msgstr_ref[0])
            if i == 0 and not msgid:  # metadata
                self.instance.metadata = self._parse_metadata(msgstr)
                continue
            # test if we have a plural entry
            msgid_tokens = msgid.split(b('\0'))
            if len(msgid_tokens) > 1:
                entry = self._build_entry(
                    msgid=msgid_tokens[0],
                    msgid_plural=msgid_tokens[1],
                    msgstr_plural=dict(enumerate(msgstr.split(b('\0')))),
                )
            else:
                entry = self._build_entry(msgid=msgid, msgstr=msgstr)
            self.instance.append(entry)

    def _parse_metadata(self, raw):
        """
        Turn the raw ``Key: value`` header lines into a dict of decoded
        strings; a line without a colon maps its text to an empty string.
        """
        encoding = self.instance.encoding
        metadata = {}
        for line in raw.split(b('\n')):
            tokens = line.split(b(':'), 1)
            if tokens[0] == b(''):
                # empty line, or nothing before the colon
                continue
            key = tokens[0].decode(encoding)
            if len(tokens) > 1:
                metadata[key] = tokens[1].decode(encoding).strip()
            else:
                metadata[key] = u('')
        return metadata

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None, msgstr_plural=None):
        """
        Build a ``MOEntry`` from raw byte strings.

        ``msgid`` may carry a message context before a ``\\x04`` byte.
        Everything is decoded with the encoding of the instance; the values
        of ``msgstr_plural`` are decoded in place.
        """
        # split once only: bytes after a second \x04 stay part of the msgid
        # instead of being silently dropped
        msgctxt_msgid = msgid.split(b('\x04'), 1)
        encoding = self.instance.encoding
        if len(msgctxt_msgid) > 1:
            kwargs = {
                'msgctxt': msgctxt_msgid[0].decode(encoding),
                'msgid': msgctxt_msgid[1].decode(encoding),
            }
        else:
            kwargs = {'msgid': msgid.decode(encoding)}
        if msgstr:
            kwargs['msgstr'] = msgstr.decode(encoding)
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
        if msgstr_plural:
            for k in msgstr_plural:
                msgstr_plural[k] = msgstr_plural[k].decode(encoding)
            kwargs['msgstr_plural'] = msgstr_plural
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpacks ``numbytes`` bytes of data read from the
        file handle using format ``fmt``.
        It returns a tuple, or the single value if the tuple length is 1.
        """
        content = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, content)
        if len(tup) == 1:
            return tup[0]
        return tup
1896
1897
1898 # }}}