cola/polib.py

   1 # -* coding: utf-8 -*-
   2 #
   3 # License: MIT (see extras/polib/LICENSE file provided)
   4 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
   5
   6 """
   7 **polib** allows you to manipulate, create, modify gettext files (pot, po and
mo files).  You can load existing files, iterate through their entries, add,
   9 modify entries, comments or metadata, etc. or create new po files from scratch.
  10
  11 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
  12 :func:`~polib.mofile` convenience functions.
  13 """
  14 from __future__ import absolute_import, division, print_function
  15 import array
  16 import codecs
  17 import os
  18 import re
  19 import struct
  20 import sys
  21 import textwrap
  22 import io
  23
  24 from . import compat
  25
  26
# Module metadata: original author and the version of polib this file carries.
__author__ = 'David Jean Louis <izimobil@gmail.com>'
__version__ = '1.1.1'
# Names exported by ``from polib import *``.
__all__ = [
    'pofile',
    'POFile',
    'POEntry',
    'mofile',
    'MOFile',
    'MOEntry',
    'default_encoding',
    'escape',
    'unescape',
    'detect_encoding',
]


# the default encoding to use when encoding cannot be detected
default_encoding = 'utf-8'
  45
  46 # python 2/3 compatibility helpers {{{
  47
  48
  49 if sys.version_info < (3,):
  50     PY3 = False
  51     text_type = compat.ustr
  52
  53     def b(s):
  54         return s
  55
  56     def u(s):
  57         return compat.ustr(s, "unicode_escape")
  58
  59 else:
  60     PY3 = True
  61     text_type = str
  62
  63     def b(s):
  64         return s.encode("utf-8")
  65
  66     def u(s):
  67         return s
  68
  69
  70 # }}}
  71 # _pofile_or_mofile {{{
  72
  73
  74 def _pofile_or_mofile(f, filetype, **kwargs):
  75     """
  76     Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
  77     honor the DRY concept.
  78     """
  79     # get the file encoding
  80     enc = kwargs.get('encoding')
  81     if enc is None:
  82         enc = detect_encoding(f, filetype == 'mofile')
  83
  84     # parse the file
  85     kls = _POFileParser if filetype == 'pofile' else _MOFileParser
  86     parser = kls(
  87         f,
  88         encoding=enc,
  89         check_for_duplicates=kwargs.get('check_for_duplicates', False),
  90         klass=kwargs.get('klass'),
  91     )
  92     instance = parser.parse()
  93     instance.wrapwidth = kwargs.get('wrapwidth', 78)
  94     return instance
  95
  96
  97 # }}}
  98 # _is_file {{{
  99
 100
 101 def _is_file(filename_or_contents):
 102     """
 103     Safely returns the value of os.path.exists(filename_or_contents).
 104
 105     Arguments:
 106
 107     ``filename_or_contents``
 108         either a filename, or a string holding the contents of some file.
 109         In the latter case, this function will always return False.
 110     """
 111     try:
 112         return os.path.isfile(filename_or_contents)
 113     except (TypeError, ValueError, UnicodeEncodeError):
 114         return False
 115
 116
 117 # }}}
 118 # function pofile() {{{
 119
 120
 121 # pylint: disable=redefined-outer-name
 122 def pofile(pofile, **kwargs):
 123     """
 124     Convenience function that parses the po or pot file ``pofile`` and returns
 125     a :class:`~polib.POFile` instance.
 126
 127     Arguments:
 128
 129     ``pofile``
 130         string, full or relative path to the po/pot file or its content (data).
 131
 132     ``wrapwidth``
 133         integer, the wrap width, only useful when the ``-w`` option was passed
 134         to xgettext (optional, default: ``78``).
 135
 136     ``encoding``
 137         string, the encoding to use (e.g. "utf-8") (default: ``None``, the
 138         encoding will be auto-detected).
 139
 140     ``check_for_duplicates``
 141         whether to check for duplicate entries when adding entries to the
 142         file (optional, default: ``False``).
 143
 144     ``klass``
 145         class which is used to instantiate the return value (optional,
 146         default: ``None``, the return value with be a :class:`~polib.POFile`
 147         instance).
 148     """
 149     return _pofile_or_mofile(pofile, 'pofile', **kwargs)
 150
 151
 152 # }}}
 153 # function mofile() {{{
 154
 155
 156 # pylint: disable=redefined-outer-name
 157 def mofile(mofile, **kwargs):
 158     """
 159     Convenience function that parses the mo file ``mofile`` and returns a
 160     :class:`~polib.MOFile` instance.
 161
 162     Arguments:
 163
 164     ``mofile``
 165         string, full or relative path to the mo file or its content (string
 166         or bytes).
 167
 168     ``wrapwidth``
 169         integer, the wrap width, only useful when the ``-w`` option was passed
 170         to xgettext to generate the po file that was used to format the mo file
 171         (optional, default: ``78``).
 172
 173     ``encoding``
 174         string, the encoding to use (e.g. "utf-8") (default: ``None``, the
 175         encoding will be auto-detected).
 176
 177     ``check_for_duplicates``
 178         whether to check for duplicate entries when adding entries to the
 179         file (optional, default: ``False``).
 180
 181     ``klass``
 182         class which is used to instantiate the return value (optional,
 183         default: ``None``, the return value with be a :class:`~polib.POFile`
 184         instance).
 185     """
 186     return _pofile_or_mofile(mofile, 'mofile', **kwargs)
 187
 188
 189 # }}}
 190 # function detect_encoding() {{{
 191
 192
 193 def detect_encoding(file, binary_mode=False):
 194     """
 195     Try to detect the encoding used by the ``file``. The ``file`` argument can
 196     be a PO or MO file path or a string containing the contents of the file.
 197     If the encoding cannot be detected, the function will return the value of
 198     ``default_encoding``.
 199
 200     Arguments:
 201
 202     ``file``
 203         string, full or relative path to the po/mo file or its content.
 204
 205     ``binary_mode``
 206         boolean, set this to True if ``file`` is a mo file.
 207     """
 208     PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
 209     rxt = re.compile(u(PATTERN))
 210     rxb = re.compile(b(PATTERN))
 211
 212     def charset_exists(charset):
 213         """Check whether ``charset`` is valid or not."""
 214         try:
 215             codecs.lookup(charset)
 216         except LookupError:
 217             return False
 218         return True
 219
 220     if not _is_file(file):
 221         try:
 222             match = rxt.search(file)
 223         except TypeError:
 224             match = rxb.search(file)
 225         if match:
 226             enc = match.group(1).strip()
 227             if not isinstance(enc, text_type):
 228                 enc = enc.decode('utf-8')
 229             if charset_exists(enc):
 230                 return enc
 231     else:
 232         # For PY3, always treat as binary
 233         if binary_mode or PY3:
 234             mode = 'rb'
 235             rx = rxb
 236         else:
 237             mode = 'r'
 238             rx = rxt
 239         f = open(file, mode)
 240         for line in f.readlines():
 241             match = rx.search(line)
 242             if match:
 243                 f.close()
 244                 enc = match.group(1).strip()
 245                 if not isinstance(enc, text_type):
 246                     enc = enc.decode('utf-8')
 247                 if charset_exists(enc):
 248                     return enc
 249         f.close()
 250     return default_encoding
 251
 252
 253 # }}}
 254 # function escape() {{{
 255
 256
 257 def escape(st):
 258     """
 259     Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 260     the given string ``st`` and returns it.
 261     """
 262     return (
 263         st.replace('\\', r'\\')
 264         .replace('\t', r'\t')
 265         .replace('\r', r'\r')
 266         .replace('\n', r'\n')
 267         .replace('\"', r'\"')
 268     )
 269
 270
 271 # }}}
 272 # function unescape() {{{
 273
 274
 275 def unescape(st):
 276     """
 277     Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 278     the given string ``st`` and returns it.
 279     """
 280
 281     def unescape_repl(m):
 282         m = m.group(1)
 283         if m == 'n':
 284             return '\n'
 285         if m == 't':
 286             return '\t'
 287         if m == 'r':
 288             return '\r'
 289         if m == '\\':
 290             return '\\'
 291         return m  # handles escaped double quote
 292
 293     return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
 294
 295
 296 # }}}
 297 # function natural_sort() {{{
 298
 299
 300 def natural_sort(lst):
 301     """
 302     Sort naturally the given list.
 303     Credits: http://stackoverflow.com/a/4836734
 304     """
 305
 306     def convert(text):
 307         return int(text) if text.isdigit() else text.lower()
 308
 309     def alphanum_key(key):
 310         return [convert(c) for c in re.split('([0-9]+)', key)]
 311
 312     return sorted(lst, key=alphanum_key)
 313
 314
 315 # }}}
 316 # class _BaseFile {{{
 317
 318
 319 class _BaseFile(list):
 320     """
 321     Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
 322     classes. This class should **not** be instantiated directly.
 323     """
 324
 325     def __init__(self, *_args, **kwargs):
 326         """
 327         Constructor, accepts the following keyword arguments:
 328
 329         ``pofile``
 330             string, the path to the po or mo file, or its content as a string.
 331
 332         ``wrapwidth``
 333             integer, the wrap width, only useful when the ``-w`` option was
 334             passed to xgettext (optional, default: ``78``).
 335
 336         ``encoding``
 337             string, the encoding to use, defaults to ``default_encoding``
 338             global variable (optional).
 339
 340         ``check_for_duplicates``
 341             whether to check for duplicate entries when adding entries to the
 342             file, (optional, default: ``False``).
 343         """
 344         list.__init__(self)
 345         # the opened file handle
 346         pofile = kwargs.get('pofile', None)  # pylint: disable=redefined-outer-name
 347         if pofile and _is_file(pofile):
 348             self.fpath = pofile
 349         else:
 350             self.fpath = kwargs.get('fpath')
 351         # the width at which lines should be wrapped
 352         self.wrapwidth = kwargs.get('wrapwidth', 78)
 353         # the file encoding
 354         self.encoding = kwargs.get('encoding', default_encoding)
 355         # whether to check for duplicate entries or not
 356         self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
 357         # header
 358         self.header = ''
 359         # both po and mo files have metadata
 360         self.metadata = {}
 361         self.metadata_is_fuzzy = 0
 362
 363     def __unicode__(self):
 364         """
 365         Returns the unicode representation of the file.
 366         """
 367         ret = []
 368         entries = [self.metadata_as_entry()] + [e for e in self if not e.obsolete]
 369         for entry in entries:
 370             ret.append(entry.__unicode__(self.wrapwidth))
 371         for entry in self.obsolete_entries():  # pylint: disable=no-member
 372             ret.append(entry.__unicode__(self.wrapwidth))
 373         ret = u('\n').join(ret)
 374         return ret
 375
 376     if PY3:
 377
 378         def __str__(self):
 379             return self.__unicode__()
 380
 381     else:
 382
 383         def __str__(self):
 384             """
 385             Returns the string representation of the file.
 386             """
 387             return compat.ustr(self).encode(self.encoding)
 388
 389     def __contains__(self, entry):
 390         """
 391         Overridden ``list`` method to implement the membership test (in and
 392         not in).
 393         The method considers that an entry is in the file if it finds an entry
 394         that has the same msgid (the test is **case sensitive**) and the same
 395         msgctxt (or none for both entries).
 396
 397         Argument:
 398
 399         ``entry``
 400             an instance of :class:`~polib._BaseEntry`.
 401         """
 402         return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) is not None
 403
 404     def __eq__(self, other):
 405         return str(self) == str(other)
 406
 407     def __hash__(self):
 408         return hash(str(self))
 409
 410     def append(self, entry):
 411         """
 412         Overridden method to check for duplicates entries, if a user tries to
 413         add an entry that is already in the file, the method will raise a
 414         ``ValueError`` exception.
 415
 416         Argument:
 417
 418         ``entry``
 419             an instance of :class:`~polib._BaseEntry`.
 420         """
 421         # check_for_duplicates may not be defined (yet) when unpickling.
 422         # But if pickling, we never want to check for duplicates anyway.
 423         if getattr(self, 'check_for_duplicates', False) and entry in self:
 424             raise ValueError('Entry "%s" already exists' % entry.msgid)
 425         super(_BaseFile, self).append(entry)
 426
 427     def insert(self, index, entry):
 428         """
 429         Overridden method to check for duplicates entries, if a user tries to
 430         add an entry that is already in the file, the method will raise a
 431         ``ValueError`` exception.
 432
 433         Arguments:
 434
 435         ``index``
 436             index at which the entry should be inserted.
 437
 438         ``entry``
 439             an instance of :class:`~polib._BaseEntry`.
 440         """
 441         if self.check_for_duplicates and entry in self:
 442             raise ValueError('Entry "%s" already exists' % entry.msgid)
 443         super(_BaseFile, self).insert(index, entry)
 444
 445     def metadata_as_entry(self):
 446         """
 447         Returns the file metadata as a :class:`~polib.POFile` instance.
 448         """
 449         e = POEntry(msgid='')
 450         mdata = self.ordered_metadata()
 451         if mdata:
 452             strs = []
 453             for name, value in mdata:
 454                 # Strip whitespace off each line in a multi-line entry
 455                 strs.append('%s: %s' % (name, value))
 456             e.msgstr = '\n'.join(strs) + '\n'
 457         if self.metadata_is_fuzzy:
 458             e.flags.append('fuzzy')
 459         return e
 460
 461     def save(self, fpath=None, repr_method='__unicode__', newline=None):
 462         """
 463         Saves the po file to ``fpath``.
 464         If it is an existing file and no ``fpath`` is provided, then the
 465         existing file is rewritten with the modified data.
 466
 467         Keyword arguments:
 468
 469         ``fpath``
 470             string, full or relative path to the file.
 471
 472         ``repr_method``
 473             string, the method to use for output.
 474
 475         ``newline``
 476             string, controls how universal newlines works
 477         """
 478         if self.fpath is None and fpath is None:
 479             raise IOError('You must provide a file path to save() method')
 480         contents = getattr(self, repr_method)()
 481         if fpath is None:
 482             fpath = self.fpath
 483         if repr_method == 'to_binary':
 484             fhandle = open(fpath, 'wb')
 485         else:
 486             fhandle = io.open(fpath, 'w', encoding=self.encoding, newline=newline)
 487             if not isinstance(contents, text_type):
 488                 contents = contents.decode(self.encoding)
 489         fhandle.write(contents)
 490         fhandle.close()
 491         # set the file path if not set
 492         if self.fpath is None and fpath:
 493             self.fpath = fpath
 494
 495     def find(self, st, by='msgid', include_obsolete_entries=False, msgctxt=False):
 496         """
 497         Find the entry which msgid (or property identified by the ``by``
 498         argument) matches the string ``st``.
 499
 500         Keyword arguments:
 501
 502         ``st``
 503             string, the string to search for.
 504
 505         ``by``
 506             string, the property to use for comparison (default: ``msgid``).
 507
 508         ``include_obsolete_entries``
 509             boolean, whether to also search in entries that are obsolete.
 510
 511         ``msgctxt``
 512             string, allows specifying a specific message context for the
 513             search.
 514         """
 515         if include_obsolete_entries:
 516             entries = self[:]
 517         else:
 518             entries = [e for e in self if not e.obsolete]
 519         matches = []
 520         for e in entries:
 521             if getattr(e, by) == st:
 522                 if msgctxt is not False and e.msgctxt != msgctxt:
 523                     continue
 524                 matches.append(e)
 525         if len(matches) == 1:
 526             return matches[0]
 527         elif len(matches) > 1:
 528             if not msgctxt:
 529                 # find the entry with no msgctx
 530                 e = None
 531                 for m in matches:
 532                     if not m.msgctxt:
 533                         e = m
 534                 if e:
 535                     return e
 536                 # fallback to the first entry found
 537                 return matches[0]
 538         return None
 539
 540     def ordered_metadata(self):
 541         """
 542         Convenience method that returns an ordered version of the metadata
 543         dictionary. The return value is list of tuples (metadata name,
 544         metadata_value).
 545         """
 546         # copy the dict first
 547         metadata = self.metadata.copy()
 548         data_order = [
 549             'Project-Id-Version',
 550             'Report-Msgid-Bugs-To',
 551             'POT-Creation-Date',
 552             'PO-Revision-Date',
 553             'Last-Translator',
 554             'Language-Team',
 555             'Language',
 556             'MIME-Version',
 557             'Content-Type',
 558             'Content-Transfer-Encoding',
 559             'Plural-Forms',
 560         ]
 561         ordered_data = []
 562         for data in data_order:
 563             try:
 564                 value = metadata.pop(data)
 565                 ordered_data.append((data, value))
 566             except KeyError:
 567                 pass
 568         # the rest of the metadata will be alphabetically ordered since there
 569         # are no specs for this AFAIK
 570         for data in natural_sort(metadata.keys()):
 571             value = metadata[data]
 572             ordered_data.append((data, value))
 573         return ordered_data
 574
 575     def to_binary(self):
 576         """
 577         Return the binary representation of the file.
 578         """
 579         offsets = []
 580         entries = self.translated_entries()  # pylint: disable=no-member
 581
 582         # the keys are sorted in the .mo file
 583         def cmp(_self, other):  # pylint: disable=unused-variable
 584             # msgfmt compares entries with msgctxt if it exists
 585             self_msgid = _self.msgctxt or _self.msgid
 586             other_msgid = other.msgctxt or other.msgid
 587             if self_msgid > other_msgid:
 588                 return 1
 589             elif self_msgid < other_msgid:
 590                 return -1
 591             else:
 592                 return 0
 593
 594         # add metadata entry
 595         entries.sort(key=lambda o: o.msgid_with_context.encode('utf-8'))
 596         mentry = self.metadata_as_entry()
 597         entries = [mentry] + entries
 598         entries_len = len(entries)
 599         ids, strs = b(''), b('')
 600         for e in entries:
 601             # For each string, we need size and file offset.  Each string is
 602             # NUL terminated; the NUL does not count into the size.
 603             msgid = b('')
 604             if e.msgctxt:
 605                 # Contexts are stored by storing the concatenation of the
 606                 # context, a <EOT> byte, and the original string
 607                 msgid = self._encode(e.msgctxt + '\4')
 608             if e.msgid_plural:
 609                 msgstr = []
 610                 for index in sorted(e.msgstr_plural.keys()):
 611                     msgstr.append(e.msgstr_plural[index])
 612                 msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
 613                 msgstr = self._encode('\0'.join(msgstr))
 614             else:
 615                 msgid += self._encode(e.msgid)
 616                 msgstr = self._encode(e.msgstr)
 617             offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
 618             ids += msgid + b('\0')
 619             strs += msgstr + b('\0')
 620
 621         # The header is 7 32-bit unsigned integers.
 622         keystart = 7 * 4 + 16 * entries_len
 623         # and the values start after the keys
 624         valuestart = keystart + len(ids)
 625         koffsets = []
 626         voffsets = []
 627         # The string table first has the list of keys, then the list of values.
 628         # Each entry has first the size of the string, then the file offset.
 629         for o1, l1, o2, l2 in offsets:
 630             koffsets += [l1, o1 + keystart]
 631             voffsets += [l2, o2 + valuestart]
 632         offsets = koffsets + voffsets
 633
 634         output = struct.pack(
 635             "Iiiiiii",
 636             # Magic number
 637             MOFile.MAGIC,
 638             # Version
 639             0,
 640             # number of entries
 641             entries_len,
 642             # start of key index
 643             7 * 4,
 644             # start of value index
 645             7 * 4 + entries_len * 8,
 646             # size and offset of hash table, we don't use hash tables
 647             0,
 648             keystart,
 649         )
 650         if PY3 and sys.version_info.minor > 1:  # python 3.2 or superior
 651             output += array.array("i", offsets).tobytes()
 652         else:
 653             output += array.array("i", offsets).tostring()  # pylint: disable=no-member
 654         output += ids
 655         output += strs
 656         return output
 657
 658     def _encode(self, mixed):
 659         """
 660         Encodes the given ``mixed`` argument with the file encoding if and
 661         only if it's an unicode string and returns the encoded string.
 662         """
 663         if isinstance(mixed, text_type):
 664             mixed = mixed.encode(self.encoding)
 665         return mixed
 666
 667
 668 # }}}
 669 # class POFile {{{
 670
 671
 672 class POFile(_BaseFile):
 673     """
 674     Po (or Pot) file reader/writer.
 675     This class inherits the :class:`~polib._BaseFile` class and, by extension,
 676     the python ``list`` type.
 677     """
 678
 679     def __unicode__(self):
 680         """
 681         Returns the unicode representation of the po file.
 682         """
 683         ret, headers = '', self.header.split('\n')
 684         for header in headers:
 685             if not header:
 686                 ret += "#\n"
 687             elif header[:1] in [',', ':']:
 688                 ret += '#%s\n' % header
 689             else:
 690                 ret += '# %s\n' % header
 691
 692         if not isinstance(ret, text_type):
 693             ret = ret.decode(self.encoding)
 694
 695         return ret + _BaseFile.__unicode__(self)
 696
 697     def save_as_mofile(self, fpath):
 698         """
 699         Saves the binary representation of the file to given ``fpath``.
 700
 701         Keyword argument:
 702
 703         ``fpath``
 704             string, full or relative path to the mo file.
 705         """
 706         _BaseFile.save(self, fpath, 'to_binary')
 707
 708     def percent_translated(self):
 709         """
 710         Convenience method that returns the percentage of translated
 711         messages.
 712         """
 713         total = len([e for e in self if not e.obsolete])
 714         if total == 0:
 715             return 100
 716         translated = len(self.translated_entries())
 717         return int(translated * 100 / float(total))
 718
 719     def translated_entries(self):
 720         """
 721         Convenience method that returns the list of translated entries.
 722         """
 723         return [e for e in self if e.translated()]
 724
 725     def untranslated_entries(self):
 726         """
 727         Convenience method that returns the list of untranslated entries.
 728         """
 729         return [
 730             e for e in self if not e.translated() and not e.obsolete and not e.fuzzy
 731         ]
 732
 733     def fuzzy_entries(self):
 734         """
 735         Convenience method that returns the list of fuzzy entries.
 736         """
 737         return [e for e in self if e.fuzzy and not e.obsolete]
 738
 739     def obsolete_entries(self):
 740         """
 741         Convenience method that returns the list of obsolete entries.
 742         """
 743         return [e for e in self if e.obsolete]
 744
 745     def merge(self, refpot):
 746         """
 747         Convenience method that merges the current pofile with the pot file
 748         provided. It behaves exactly as the gettext msgmerge utility:
 749
 750         * comments of this file will be preserved, but extracted comments and
 751           occurrences will be discarded;
 752         * any translations or comments in the file will be discarded, however,
 753           dot comments and file positions will be preserved;
 754         * the fuzzy flags are preserved.
 755
 756         Keyword argument:
 757
 758         ``refpot``
 759             object POFile, the reference catalog.
 760         """
 761         # Store entries in dict/set for faster access
 762         self_entries = dict((entry.msgid_with_context, entry) for entry in self)
 763         refpot_msgids = set(entry.msgid_with_context for entry in refpot)
 764         # Merge entries that are in the refpot
 765         for entry in refpot:
 766             e = self_entries.get(entry.msgid_with_context)
 767             if e is None:
 768                 e = POEntry()
 769                 self.append(e)
 770             e.merge(entry)
 771         # ok, now we must "obsolete" entries that are not in the refpot anymore
 772         for entry in self:
 773             if entry.msgid_with_context not in refpot_msgids:
 774                 entry.obsolete = True
 775
 776
 777 # }}}
 778 # class MOFile {{{
 779
 780
 781 class MOFile(_BaseFile):
 782     """
 783     Mo file reader/writer.
 784     This class inherits the :class:`~polib._BaseFile` class and, by
 785     extension, the python ``list`` type.
 786     """
 787
 788     MAGIC = 0x950412DE
 789     MAGIC_SWAPPED = 0xDE120495
 790
 791     def __init__(self, *args, **kwargs):
 792         """
 793         Constructor, accepts all keywords arguments accepted by
 794         :class:`~polib._BaseFile` class.
 795         """
 796         _BaseFile.__init__(self, *args, **kwargs)
 797         self.magic_number = None
 798         self.version = 0
 799
 800     def save_as_pofile(self, fpath):
 801         """
 802         Saves the mofile as a pofile to ``fpath``.
 803
 804         Keyword argument:
 805
 806         ``fpath``
 807             string, full or relative path to the file.
 808         """
 809         _BaseFile.save(self, fpath)
 810
 811     # pylint: disable=no-self-use,arguments-differ
 812     def save(self, fpath=None):
 813         """
 814         Saves the mofile to ``fpath``.
 815
 816         Keyword argument:
 817
 818         ``fpath``
 819             string, full or relative path to the file.
 820         """
 821         _BaseFile.save(self, fpath, 'to_binary')
 822
 823     # pylint: disable=no-self-use
 824     def percent_translated(self):
 825         """
 826         Convenience method to keep the same interface with POFile instances.
 827         """
 828         return 100
 829
 830     # pylint: disable=no-self-use
 831     def translated_entries(self):
 832         """
 833         Convenience method to keep the same interface with POFile instances.
 834         """
 835         return self
 836
 837     # pylint: disable=no-self-use
 838     def untranslated_entries(self):
 839         """
 840         Convenience method to keep the same interface with POFile instances.
 841         """
 842         return []
 843
 844     # pylint: disable=no-self-use
 845     def fuzzy_entries(self):
 846         """
 847         Convenience method to keep the same interface with POFile instances.
 848         """
 849         return []
 850
 851     # pylint: disable=no-self-use
 852     def obsolete_entries(self):
 853         """
 854         Convenience method to keep the same interface with POFile instances.
 855         """
 856         return []
 857
 858
 859 # }}}
 860 # class _BaseEntry {{{
 861
 862
 863 class _BaseEntry(object):
 864     """
 865     Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
 866     This class should **not** be instantiated directly.
 867     """
 868
 869     def __init__(self, *_args, **kwargs):
 870         """
 871         Constructor, accepts the following keyword arguments:
 872
 873         ``msgid``
 874             string, the entry msgid.
 875
 876         ``msgstr``
 877             string, the entry msgstr.
 878
 879         ``msgid_plural``
 880             string, the entry msgid_plural.
 881
 882         ``msgstr_plural``
 883             dict, the entry msgstr_plural lines.
 884
 885         ``msgctxt``
 886             string, the entry context (msgctxt).
 887
 888         ``obsolete``
 889             bool, whether the entry is "obsolete" or not.
 890
 891         ``encoding``
 892             string, the encoding to use, defaults to ``default_encoding``
 893             global variable (optional).
 894         """
 895         self.msgid = kwargs.get('msgid', '')
 896         self.msgstr = kwargs.get('msgstr', '')
 897         self.msgid_plural = kwargs.get('msgid_plural', '')
 898         self.msgstr_plural = kwargs.get('msgstr_plural', {})
 899         self.msgctxt = kwargs.get('msgctxt', None)
 900         self.obsolete = kwargs.get('obsolete', False)
 901         self.encoding = kwargs.get('encoding', default_encoding)
 902
 903     def __unicode__(self, wrapwidth=78):
 904         """
 905         Returns the unicode representation of the entry.
 906         """
 907         if self.obsolete:
 908             delflag = '#~ '
 909         else:
 910             delflag = ''
 911         ret = []
 912         # write the msgctxt if any
 913         if self.msgctxt is not None:
 914             ret += self._str_field("msgctxt", delflag, "", self.msgctxt, wrapwidth)
 915         # write the msgid
 916         ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
 917         # write the msgid_plural if any
 918         if self.msgid_plural:
 919             ret += self._str_field(
 920                 "msgid_plural", delflag, "", self.msgid_plural, wrapwidth
 921             )
 922         if self.msgstr_plural:
 923             # write the msgstr_plural if any
 924             msgstrs = self.msgstr_plural
 925             keys = list(msgstrs)
 926             keys.sort()
 927             for index in keys:
 928                 msgstr = msgstrs[index]
 929                 plural_index = '[%s]' % index
 930                 ret += self._str_field(
 931                     "msgstr", delflag, plural_index, msgstr, wrapwidth
 932                 )
 933         else:
 934             # otherwise write the msgstr
 935             ret += self._str_field("msgstr", delflag, "", self.msgstr, wrapwidth)
 936         ret.append('')
 937         ret = u('\n').join(ret)
 938         return ret
 939
 940     if PY3:
 941
 942         def __str__(self):
 943             return self.__unicode__()
 944
 945     else:
 946
 947         def __str__(self):
 948             """
 949             Returns the string representation of the entry.
 950             """
 951             return compat.ustr(self).encode(self.encoding)
 952
 953     def __eq__(self, other):
 954         return str(self) == str(other)
 955
 956     def __hash__(self):
 957         return hash(str(self))
 958
 959     # pylint: disable=no-self-use
 960     def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78):
 961         lines = field.splitlines(True)
 962         if len(lines) > 1:
 963             lines = [''] + lines  # start with initial empty line
 964         else:
 965             escaped_field = escape(field)
 966             specialchars_count = 0
 967             for c in ['\\', '\n', '\r', '\t', '"']:
 968                 specialchars_count += field.count(c)
 969             # comparison must take into account fieldname length + one space
 970             # + 2 quotes (eg. msgid "<string>")
 971             flength = len(fieldname) + 3
 972             if plural_index:
 973                 flength += len(plural_index)
 974             real_wrapwidth = wrapwidth - flength + specialchars_count
 975             if wrapwidth > 0 and len(field) > real_wrapwidth:
 976                 # Wrap the line but take field name into account
 977                 lines = [''] + [
 978                     unescape(item)
 979                     for item in textwrap.wrap(
 980                         escaped_field,
 981                         wrapwidth - 2,  # 2 for quotes ""
 982                         drop_whitespace=False,
 983                         break_long_words=False,
 984                     )
 985                 ]
 986             else:
 987                 lines = [field]
 988         if fieldname.startswith('previous_'):
 989             # quick and dirty trick to get the real field name
 990             fieldname = fieldname[9:]
 991
 992         ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index, escape(lines.pop(0)))]
 993         for line in lines:
 994             ret.append('%s"%s"' % (delflag, escape(line)))
 995         return ret
 996
 997     @property
 998     def msgid_with_context(self):
 999         if self.msgctxt:
1000             return '%s%s%s' % (self.msgctxt, "\x04", self.msgid)
1001         return self.msgid
1002
1003
1004 # }}}
1005 # class POEntry {{{
1006
1007
class POEntry(_BaseEntry):
    """
    Represents a po file entry.

    On top of the fields handled by the base entry, a po entry carries the
    extracted and translator comments, the occurrences, the flags, the
    "previous" fields and the line number it was read from.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``comment``
            string, the entry comment.

        ``tcomment``
            string, the entry translator comment.

        ``occurrences``
            list, the entry occurrences.

        ``flags``
            list, the entry flags.

        ``previous_msgctxt``
            string, the entry previous context.

        ``previous_msgid``
            string, the entry previous msgid.

        ``previous_msgid_plural``
            string, the entry previous msgid_plural.

        ``linenum``
            integer, the line number of the entry
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        self.comment = kwargs.get('comment', '')
        self.tcomment = kwargs.get('tcomment', '')
        self.occurrences = kwargs.get('occurrences', [])
        self.flags = kwargs.get('flags', [])
        self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
        self.previous_msgid = kwargs.get('previous_msgid', None)
        self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
        self.linenum = kwargs.get('linenum', None)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry.

        The comments, occurrences, flags and "previous" fields are written
        first, followed by the representation built by the base entry.
        Obsolete entries only keep their translator comment: the extracted
        comment and the occurrences are left out.  Comment and occurrence
        lines are only wrapped when ``wrapwidth`` is greater than zero.
        """
        ret = []
        # comments first, if any (with text wrapping as xgettext does)
        if self.obsolete:
            comments = [('tcomment', '# ')]
        else:
            comments = [('comment', '#. '), ('tcomment', '# ')]
        for attr, prefix in comments:
            val = getattr(self, attr)
            if not val:
                continue
            for comment in val.split('\n'):
                if len(comment) + len(prefix) > wrapwidth > 0:
                    ret += textwrap.wrap(
                        comment,
                        wrapwidth,
                        initial_indent=prefix,
                        subsequent_indent=prefix,
                        break_long_words=False,
                    )
                else:
                    ret.append('%s%s' % (prefix, comment))

        # occurrences (with text wrapping as xgettext does)
        if not self.obsolete and self.occurrences:
            filelist = []
            for fpath, lineno in self.occurrences:
                if lineno:
                    filelist.append('%s:%s' % (fpath, lineno))
                else:
                    filelist.append(fpath)
            filestr = ' '.join(filelist)
            if len(filestr) + 3 > wrapwidth > 0:
                # textwrap splits words that contain hyphens by default, which
                # is not what we want for file names.  Turning that off keeps
                # each path in one piece without rewriting any character of
                # it, so paths that contain "*" are written back unchanged.
                ret += textwrap.wrap(
                    filestr,
                    wrapwidth,
                    initial_indent='#: ',
                    subsequent_indent='#: ',
                    break_long_words=False,
                    break_on_hyphens=False,
                )
            else:
                ret.append('#: ' + filestr)

        # flags (TODO: wrapping ?)
        if self.flags:
            ret.append('#, %s' % ', '.join(self.flags))

        # previous context and previous msgid/msgid_plural
        fields = ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural']
        if self.obsolete:
            prefix = "#~| "
        else:
            prefix = "#| "
        for f in fields:
            val = getattr(self, f)
            if val is not None:
                ret += self._str_field(f, prefix, "", val, wrapwidth)

        ret.append(_BaseEntry.__unicode__(self, wrapwidth))
        return u('\n').join(ret)

    # pylint: disable=cmp-method,too-many-return-statements
    def __cmp__(self, other):
        """
        Called by comparison operations if rich comparison is not defined.

        Returns ``-1``, ``0`` or ``1``.  The entries are compared, in this
        order, on: the obsolete flag (obsolete entries sort first), the
        sorted occurrences, msgctxt, msgid_plural, the msgstr_plural values,
        msgid and finally msgstr.  A missing msgctxt or msgid_plural is
        compared as the string ``'0'``.
        """
        # First: Obsolete test
        if self.obsolete != other.obsolete:
            if self.obsolete:
                return -1
            return 1
        # Work on sorted copies to protect the originals
        occ1 = sorted(self.occurrences[:])
        occ2 = sorted(other.occurrences[:])
        if occ1 > occ2:
            return 1
        if occ1 < occ2:
            return -1
        # Compare context
        msgctxt = self.msgctxt or '0'
        othermsgctxt = other.msgctxt or '0'
        if msgctxt > othermsgctxt:
            return 1
        if msgctxt < othermsgctxt:
            return -1
        # Compare msgid_plural
        msgid_plural = self.msgid_plural or '0'
        othermsgid_plural = other.msgid_plural or '0'
        if msgid_plural > othermsgid_plural:
            return 1
        if msgid_plural < othermsgid_plural:
            return -1
        # Compare msgstr_plural (anything that is not a non-empty dict is
        # compared as an empty list)
        if self.msgstr_plural and isinstance(self.msgstr_plural, dict):
            msgstr_plural = list(self.msgstr_plural.values())
        else:
            msgstr_plural = []
        if other.msgstr_plural and isinstance(other.msgstr_plural, dict):
            othermsgstr_plural = list(other.msgstr_plural.values())
        else:
            othermsgstr_plural = []
        if msgstr_plural > othermsgstr_plural:
            return 1
        if msgstr_plural < othermsgstr_plural:
            return -1
        # Compare msgid
        if self.msgid > other.msgid:
            return 1
        if self.msgid < other.msgid:
            return -1
        # Compare msgstr
        if self.msgstr > other.msgstr:
            return 1
        if self.msgstr < other.msgstr:
            return -1
        return 0

    # The rich comparison methods all delegate to __cmp__ so that the
    # ordering is the same on Python 2 and Python 3.

    def __gt__(self, other):
        return self.__cmp__(other) > 0

    def __lt__(self, other):
        return self.__cmp__(other) < 0

    def __ge__(self, other):
        return self.__cmp__(other) >= 0

    def __le__(self, other):
        return self.__cmp__(other) <= 0

    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0

    def translated(self):
        """
        Returns ``True`` if the entry has been translated or ``False``
        otherwise.

        Obsolete and fuzzy entries are never considered translated.  An entry
        without msgstr is translated only when it has plural forms and none
        of them is empty.
        """
        if self.obsolete or self.fuzzy:
            return False
        if self.msgstr != '':
            return True
        if self.msgstr_plural:
            return all(
                self.msgstr_plural[pos] != '' for pos in self.msgstr_plural
            )
        return False

    def merge(self, other):
        """
        Merge the current entry with the given pot entry.

        The msgid, msgctxt, occurrences, comment, flags, msgid_plural,
        obsolete flag and "previous" fields are taken from ``other``.  The
        fuzzy flag of the current entry is kept.  Existing plural
        translations are kept and an empty translation is added for every
        plural index that only exists in ``other``.
        """
        self.msgid = other.msgid
        self.msgctxt = other.msgctxt
        # copy the list so that later changes to one entry do not leak into
        # the other one
        self.occurrences = list(other.occurrences)
        self.comment = other.comment
        was_fuzzy = self.fuzzy
        self.flags = other.flags[:]  # clone flags
        # keep the fuzzy state, without duplicating the flag when the pot
        # entry is fuzzy too
        if was_fuzzy and 'fuzzy' not in self.flags:
            self.flags.append('fuzzy')
        self.msgid_plural = other.msgid_plural
        self.obsolete = other.obsolete
        self.previous_msgctxt = other.previous_msgctxt
        self.previous_msgid = other.previous_msgid
        self.previous_msgid_plural = other.previous_msgid_plural
        if other.msgstr_plural:
            for pos in other.msgstr_plural:
                # keep existing translation at pos if any
                self.msgstr_plural.setdefault(pos, '')

    @property
    def fuzzy(self):
        """``True`` when ``'fuzzy'`` is one of the entry flags."""
        return 'fuzzy' in self.flags

    def __hash__(self):
        return hash((self.msgid, self.msgstr))
1244
1245
1246 # }}}
1247 # class MOEntry {{{
1248
1249
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments,
        for consistency with :class:`~polib.POEntry`:

        ``comment``
        ``tcomment``
        ``occurrences``
        ``flags``
        ``previous_msgctxt``
        ``previous_msgid``
        ``previous_msgid_plural``

        Note: even though these keyword arguments are accepted,
        they hold no real meaning in the context of MO files
        and are simply ignored.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # the po-only attributes always get their empty value, whatever
        # was passed in
        self.comment = self.tcomment = ''
        self.occurrences, self.flags = [], []
        self.previous_msgctxt = None
        self.previous_msgid = None
        self.previous_msgid_plural = None

    def __hash__(self):
        key = (self.msgid, self.msgstr)
        return hash(key)
1283
1284
1285 # }}}
1286 # class _POFileParser {{{
1287
1288
class _POFileParser(object):
    """
    A finite state machine to parse efficiently and correctly po
    file format.
    """

    # matches a double quote that is not preceded by a backslash
    _unescaped_quote_re = re.compile(r'([^\\]|^)"')

    # pylint: disable=redefined-outer-name
    def __init__(self, pofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``pofile``
            string, path to the po file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class used to build the returned instance, defaults to
            ``POFile`` (optional).
        """
        enc = kwargs.get('encoding', default_encoding)
        if _is_file(pofile):
            try:
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
            except LookupError:
                # unknown encoding name: fall back to the default one
                enc = default_encoding
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
        else:
            self.fhandle = pofile.splitlines()

        klass = kwargs.get('klass')
        if klass is None:
            klass = POFile
        self.instance = klass(
            pofile=pofile,
            encoding=enc,
            check_for_duplicates=kwargs.get('check_for_duplicates', False),
        )
        self.transitions = {}
        self.current_line = 0
        self.current_entry = POEntry(linenum=self.current_line)
        self.current_state = 'st'
        self.current_token = None
        # two memo flags used in handlers
        self.msgstr_index = 0
        self.entry_obsolete = 0
        # Configure the state machine, by adding transitions.
        # Signification of symbols:
        #     * ST: Beginning of the file (start)
        #     * HE: Header
        #     * TC: a translation comment
        #     * GC: a generated comment
        #     * OC: a file/line occurrence
        #     * FL: a flags line
        #     * CT: a message context
        #     * PC: a previous msgctxt
        #     * PM: a previous msgid
        #     * PP: a previous msgid_plural
        #     * MI: a msgid
        #     * MP: a msgid plural
        #     * MS: a msgstr
        #     * MX: a msgstr plural
        #     * MC: a msgid or msgstr continuation line
        all_states = [
            'st',
            'he',
            'gc',
            'oc',
            'fl',
            'ct',
            'pc',
            'pm',
            'pp',
            'tc',
            'ms',
            'mp',
            'mx',
            'mi',
        ]

        self.add('tc', ['st', 'he'], 'he')
        self.add(
            'tc',
            ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mp', 'mx', 'mi'],
            'tc',
        )
        self.add('gc', all_states, 'gc')
        self.add('oc', all_states, 'oc')
        self.add('fl', all_states, 'fl')
        self.add('pc', all_states, 'pc')
        self.add('pm', all_states, 'pm')
        self.add('pp', all_states, 'pp')
        self.add(
            'ct',
            ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
            'ct',
        )
        self.add(
            'mi',
            ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc', 'pm', 'pp', 'ms', 'mx'],
            'mi',
        )
        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
        self.add('ms', ['mi', 'mp', 'tc'], 'ms')
        self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')

    def parse(self):
        """
        Run the state machine, parse the file line by line and call process()
        with the current matched symbol.

        Returns the instance built in the constructor, filled with the parsed
        entries and metadata.  Raises ``IOError`` on a syntax error.  The
        file handle opened by the constructor is closed in every case, also
        when an error is raised.
        """
        try:
            tokens = self._parse_lines()
            if (
                self.current_entry
                and len(tokens) > 0
                and not tokens[0].startswith('#')
            ):
                # since entries are added when another entry is found, we
                # must add the last entry here (only if there are lines).
                # Trailing comments are ignored
                self.instance.append(self.current_entry)
            self._extract_metadata()
        finally:
            # close opened file
            self._close_handle()
        return self.instance

    # pylint: disable=too-many-branches
    def _parse_lines(self):
        """
        Feed every non-blank line to the state machine and return the tokens
        of the last non-blank line (an empty list if there is none).
        """
        keywords = {
            'msgctxt': 'ct',
            'msgid': 'mi',
            'msgstr': 'ms',
            'msgid_plural': 'mp',
        }
        prev_keywords = {
            'msgid_plural': 'pp',
            'msgid': 'pm',
            'msgctxt': 'pc',
        }
        tokens = []
        for line in self.fhandle:
            self.current_line += 1
            if self.current_line == 1:
                # drop a leading byte order mark
                BOM = codecs.BOM_UTF8.decode('utf-8')
                if line.startswith(BOM):
                    line = line[len(BOM) :]
            line = line.strip()
            if line == '':
                continue

            tokens = line.split(None, 2)
            nb_tokens = len(tokens)

            # "previous" fields of obsolete entries are skipped
            if tokens[0] == '#~|':
                continue

            if tokens[0] == '#~' and nb_tokens > 1:
                # obsolete line: strip the marker and handle the rest of the
                # line like a regular one
                line = line[3:].strip()
                tokens = tokens[1:]
                nb_tokens -= 1
                self.entry_obsolete = 1
            else:
                self.entry_obsolete = 0

            # Take care of keywords like
            # msgid, msgid_plural, msgctxt & msgstr.
            if tokens[0] in keywords and nb_tokens > 1:
                line = line[len(tokens[0]) :].lstrip()
                if self._unescaped_quote_re.search(line[1:-1]):
                    raise self._syntax_error('unescaped double quote found')
                self.current_token = line
                self.process(keywords[tokens[0]])
                continue

            self.current_token = line

            if tokens[0] == '#:':
                if nb_tokens <= 1:
                    continue
                # we are on a occurrences line
                self.process('oc')

            elif line[:1] == '"':
                # we are on a continuation line
                if self._unescaped_quote_re.search(line[1:-1]):
                    raise self._syntax_error('unescaped double quote found')
                self.process('mc')

            elif line[:7] == 'msgstr[':
                # we are on a msgstr plural
                self.process('mx')

            elif tokens[0] == '#,':
                if nb_tokens <= 1:
                    continue
                # we are on a flags line
                self.process('fl')

            elif tokens[0] == '#' or tokens[0].startswith('##'):
                # we are on a translator comment line
                self.process('tc')

            elif tokens[0] == '#.':
                if nb_tokens <= 1:
                    continue
                # we are on a generated comment line
                self.process('gc')

            elif tokens[0] == '#|':
                if nb_tokens <= 1:
                    raise self._syntax_error()

                # Remove the marker and any whitespace right after that.
                line = line[2:].lstrip()
                self.current_token = line

                if tokens[1].startswith('"'):
                    # Continuation of previous metadata.
                    self.process('mc')
                    continue

                if nb_tokens == 2:
                    # Invalid continuation line.
                    raise self._syntax_error('invalid continuation line')

                # we are on a "previous translation" comment line,
                if tokens[1] not in prev_keywords:
                    # Unknown keyword in previous translation comment.
                    raise self._syntax_error('unknown keyword %s' % tokens[1])

                # Remove the keyword and any whitespace
                # between it and the starting quote.
                line = line[len(tokens[1]) :].lstrip()
                self.current_token = line
                self.process(prev_keywords[tokens[1]])

            else:
                raise self._syntax_error()

        return tokens

    def _extract_metadata(self):
        """
        Move the entry with an empty msgid, if any, out of the entries and
        into the instance metadata.
        """
        metadataentry = self.instance.find('')
        if not metadataentry:
            return
        # remove the entry
        self.instance.remove(metadataentry)
        self.instance.metadata_is_fuzzy = metadataentry.flags
        key = None
        for msg in metadataentry.msgstr.splitlines():
            try:
                key, val = msg.split(':', 1)
                self.instance.metadata[key] = val.strip()
            except (ValueError, KeyError):
                # a line without ":" continues the previous metadata value
                if key is not None:
                    self.instance.metadata[key] += '\n' + msg.strip()

    def _close_handle(self):
        """Close the input if it is a file (a list of lines has no close)."""
        if hasattr(self.fhandle, 'close'):
            self.fhandle.close()

    def _syntax_error(self, detail=None):
        """
        Build the ``IOError`` for a syntax error on the current line,
        optionally followed by ``detail``.
        """
        fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
        message = 'Syntax error in po file %s(line %s)' % (fpath, self.current_line)
        if detail:
            message += ': ' + detail
        return IOError(message)

    def add(self, symbol, states, next_state):
        """
        Add a transition to the state machine.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``states``
            list, a list of states (two chars symbols).

        ``next_state``
            the next state the fsm will have after the action.
        """
        # the handler only depends on the next state
        action = getattr(self, 'handle_%s' % next_state)
        for state in states:
            self.transitions[(symbol, state)] = (action, next_state)

    def process(self, symbol):
        """
        Process the transition corresponding to the current state and the
        symbol provided.

        The handler registered for the transition is called; the state only
        changes when the handler returns a true value.  Any exception,
        including a missing transition, closes the input file and is turned
        into an ``IOError`` naming the current line.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).
        """
        try:
            (action, state) = self.transitions[(symbol, self.current_state)]
            if action():
                self.current_state = state
        except Exception:
            error = self._syntax_error()
            self._close_handle()
            raise error

    # state handlers

    def _start_entry_if_needed(self):
        """
        Store the entry being built and start a new one when the previous
        line belonged to a msgstr.
        """
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)

    def handle_he(self):
        """Handle a header comment."""
        if self.instance.header != '':
            self.instance.header += '\n'
        self.instance.header += self.current_token[2:]
        return 1

    def handle_tc(self):
        """Handle a translator comment."""
        self._start_entry_if_needed()
        if self.current_entry.tcomment != '':
            self.current_entry.tcomment += '\n'
        # drop the leading "#" characters and at most one following space
        tcomment = self.current_token.lstrip('#')
        if tcomment.startswith(' '):
            tcomment = tcomment[1:]
        self.current_entry.tcomment += tcomment
        return True

    def handle_gc(self):
        """Handle a generated comment."""
        self._start_entry_if_needed()
        if self.current_entry.comment != '':
            self.current_entry.comment += '\n'
        self.current_entry.comment += self.current_token[3:]
        return True

    def handle_oc(self):
        """Handle a file:num occurrence."""
        self._start_entry_if_needed()
        occurrences = self.current_token[3:].split()
        for occurrence in occurrences:
            if occurrence != '':
                try:
                    fil, line = occurrence.rsplit(':', 1)
                    if not line.isdigit():
                        # the part after the last ":" is not a line number,
                        # keep the whole token as the file name
                        fil = occurrence
                        line = ''
                    self.current_entry.occurrences.append((fil, line))
                except (ValueError, AttributeError):
                    self.current_entry.occurrences.append((occurrence, ''))
        return True

    def handle_fl(self):
        """Handle a flags line."""
        self._start_entry_if_needed()
        self.current_entry.flags += [
            c.strip() for c in self.current_token[3:].split(',')
        ]
        return True

    def handle_pp(self):
        """Handle a previous msgid_plural line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_pm(self):
        """Handle a previous msgid line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgid = unescape(self.current_token[1:-1])
        return True

    def handle_pc(self):
        """Handle a previous msgctxt line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_ct(self):
        """Handle a msgctxt."""
        self._start_entry_if_needed()
        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_mi(self):
        """Handle a msgid."""
        self._start_entry_if_needed()
        self.current_entry.obsolete = self.entry_obsolete
        self.current_entry.msgid = unescape(self.current_token[1:-1])
        return True

    def handle_mp(self):
        """Handle a msgid plural."""
        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_ms(self):
        """Handle a msgstr."""
        self.current_entry.msgstr = unescape(self.current_token[1:-1])
        return True

    def handle_mx(self):
        """Handle a msgstr plural."""
        token = self.current_token
        # the index is everything between "msgstr[" and the closing bracket,
        # so that indexes with more than one digit are read in full
        index = int(token[7 : token.index(']', 7)])
        value = token[token.find('"') + 1 : -1]
        self.current_entry.msgstr_plural[index] = unescape(value)
        self.msgstr_index = index
        return True

    def handle_mc(self):
        """Handle a msgid or msgstr continuation line."""
        token = unescape(self.current_token[1:-1])
        if self.current_state == 'ct':
            self.current_entry.msgctxt += token
        elif self.current_state == 'mi':
            self.current_entry.msgid += token
        elif self.current_state == 'mp':
            self.current_entry.msgid_plural += token
        elif self.current_state == 'ms':
            self.current_entry.msgstr += token
        elif self.current_state == 'mx':
            self.current_entry.msgstr_plural[self.msgstr_index] += token
        elif self.current_state == 'pp':
            self.current_entry.previous_msgid_plural += token
        elif self.current_state == 'pm':
            self.current_entry.previous_msgid += token
        elif self.current_state == 'pc':
            self.current_entry.previous_msgctxt += token
        # don't change the current state
        return False
1749
1750
1751 # }}}
1752 # class _MOFileParser {{{
1753
1754
class _MOFileParser(object):
    """
    A class to parse binary mo files.

    The parser reads from a binary file handle created in the constructor
    and fills an instance of ``MOFile`` (or of the class given through the
    ``klass`` keyword argument) when :meth:`parse` is called.
    """

    # pylint: disable=unused-argument,redefined-outer-name
    def __init__(self, mofile, *_args, **kwargs):
        """
        Constructor.

        Arguments:

        ``mofile``
            string, path to the mo file or its (binary) content

        Keyword arguments:

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class used to build the parsed instance (optional, default:
            ``MOFile``).
        """
        if _is_file(mofile):
            self.fhandle = open(mofile, 'rb')
        else:
            # not a path: wrap the given content in an in-memory binary file
            self.fhandle = io.BytesIO(mofile)

        klass = kwargs.get('klass')
        if klass is None:
            klass = MOFile
        self.instance = klass(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False),
        )

    def __del__(self):
        """
        Make sure the file is closed, this prevents warnings on unclosed file
        when running tests with python >= 3.2.
        """
        # ``fhandle`` does not exist when the constructor failed before
        # assigning it (e.g. ``open`` raised), so look it up defensively.
        fhandle = getattr(self, 'fhandle', None)
        if fhandle is not None and hasattr(fhandle, 'close'):
            fhandle.close()

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Returns the populated instance.  The file handle is closed when
        this method exits, whether parsing succeeded or not.

        Raises ``IOError`` when the magic number or the major revision
        number is not a supported one.
        """
        try:
            # parse magic number, it also tells the byte order of the file
            magic_number = self._readbinary('<I', 4)
            if magic_number == MOFile.MAGIC:
                ii = '<II'
            elif magic_number == MOFile.MAGIC_SWAPPED:
                ii = '>II'
            else:
                raise IOError('Invalid mo file, magic number is incorrect !')
            self.instance.magic_number = magic_number
            # parse the version number and the number of strings
            version, numofstrings = self._readbinary(ii, 8)
            # from MO file format specs: "A program seeing an unexpected major
            # revision number should stop reading the MO file entirely"
            if version >> 16 not in (0, 1):
                raise IOError('Invalid mo file, unexpected major revision number')
            self.instance.version = version
            # offsets of the original strings and translation strings tables
            msgids_table_offset, msgstrs_table_offset = self._readbinary(ii, 8)
            # each table holds one (length, offset) pair per string
            self.fhandle.seek(msgids_table_offset)
            msgids_index = [self._readbinary(ii, 8) for _ in range(numofstrings)]
            self.fhandle.seek(msgstrs_table_offset)
            msgstrs_index = [self._readbinary(ii, 8) for _ in range(numofstrings)]
            # build entries
            descriptors = zip(msgids_index, msgstrs_index)
            for position, (msgid_desc, msgstr_desc) in enumerate(descriptors):
                self.fhandle.seek(msgid_desc[1])
                msgid = self.fhandle.read(msgid_desc[0])

                self.fhandle.seek(msgstr_desc[1])
                msgstr = self.fhandle.read(msgstr_desc[0])
                if position == 0 and not msgid:  # metadata
                    self.instance.metadata = self._parse_metadata(msgstr)
                    continue
                # test if we have a plural entry: singular and plural msgids
                # (and the plural translations) are separated by NUL bytes
                msgid_tokens = msgid.split(b('\0'))
                if len(msgid_tokens) > 1:
                    entry = self._build_entry(
                        msgid=msgid_tokens[0],
                        msgid_plural=msgid_tokens[1],
                        msgstr_plural=dict(enumerate(msgstr.split(b('\0')))),
                    )
                else:
                    entry = self._build_entry(msgid=msgid, msgstr=msgstr)
                self.instance.append(entry)
        finally:
            # close opened file, even when the content is invalid
            self.fhandle.close()
        return self.instance

    def _parse_metadata(self, msgstr):
        """
        Private method that turns the raw metadata bytes (the msgstr of the
        entry with an empty msgid) into a dict of decoded ``key: value``
        pairs.  Lines without a ``:`` separator get an empty value, lines
        with an empty key are ignored.
        """
        encoding = self.instance.encoding
        metadata = {}
        for line in msgstr.split(b('\n')):
            tokens = line.split(b(':'), 1)
            if tokens[0] == b(''):
                continue
            key = tokens[0].decode(encoding)
            if len(tokens) > 1:
                metadata[key] = tokens[1].decode(encoding).strip()
            else:
                metadata[key] = u('')
        return metadata

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None, msgstr_plural=None):
        """
        Private method that builds a ``MOEntry`` from raw (bytes) values,
        decoding them with the encoding of the instance.

        ``msgid`` may be prefixed by a context followed by an EOT byte
        (``\\x04``); ``msgstr_plural`` is a dict mapping plural indexes to
        raw translations.  Empty/``None`` optional values are not passed to
        the entry.
        """
        encoding = self.instance.encoding
        # split only on the first EOT byte so that nothing located after a
        # second one is silently dropped from the msgid
        msgctxt_msgid = msgid.split(b('\x04'), 1)
        if len(msgctxt_msgid) > 1:
            kwargs = {
                'msgctxt': msgctxt_msgid[0].decode(encoding),
                'msgid': msgctxt_msgid[1].decode(encoding),
            }
        else:
            kwargs = {'msgid': msgid.decode(encoding)}
        if msgstr:
            kwargs['msgstr'] = msgstr.decode(encoding)
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
        if msgstr_plural:
            # build a new dict rather than decoding the caller's dict in place
            kwargs['msgstr_plural'] = dict(
                (index, value.decode(encoding))
                for index, value in msgstr_plural.items()
            )
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpack n bytes of data using format <fmt>.
        It returns a tuple or a mixed value if the tuple length is 1.

        ``struct.error`` is raised when the data read does not match the
        size required by ``fmt`` (e.g. on a truncated file).
        """
        content = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, content)
        if len(tup) == 1:
            return tup[0]
        return tup
1901
1902
1903 # }}}