cola/polib.py

# -*- coding: utf-8 -*-
   2 #
   3 # License: MIT (see LICENSE file provided)
   4 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
   5
   6 """
**polib** allows you to manipulate, create and modify gettext files (pot, po
and mo files).  You can load existing files, iterate through their entries,
add or modify entries, comments or metadata, etc., or create new po files
from scratch.
  10
  11 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
  12 :func:`~polib.mofile` convenience functions.
  13 """
  14 from __future__ import absolute_import, division, print_function
  15 import array
  16 import codecs
  17 import os
  18 import re
  19 import struct
  20 import sys
  21 import textwrap
  22 import io
  23
  24 from . import compat
  25
  26
  27 __author__ = 'David Jean Louis <izimobil@gmail.com>'
  28 __version__ = '1.1.1'
  29 __all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
  30            'default_encoding', 'escape', 'unescape', 'detect_encoding', ]
  31
  32
  33 # the default encoding to use when encoding cannot be detected
  34 default_encoding = 'utf-8'
  35
  36 # python 2/3 compatibility helpers {{{
  37
  38
  39 if sys.version_info < (3,):
  40     PY3 = False
  41     text_type = compat.ustr
  42
  43     def b(s):
  44         return s
  45
  46     def u(s):
  47         return compat.ustr(s, "unicode_escape")
  48
  49 else:
  50     PY3 = True
  51     text_type = str
  52
  53     def b(s):
  54         return s.encode("utf-8")
  55
  56     def u(s):
  57         return s
  58 # }}}
  59 # _pofile_or_mofile {{{
  60
  61
  62 def _pofile_or_mofile(f, filetype, **kwargs):
  63     """
  64     Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
  65     honor the DRY concept.
  66     """
  67     # get the file encoding
  68     enc = kwargs.get('encoding')
  69     if enc is None:
  70         enc = detect_encoding(f, filetype == 'mofile')
  71
  72     # parse the file
  73     kls = _POFileParser if filetype == 'pofile' else _MOFileParser
  74     parser = kls(
  75         f,
  76         encoding=enc,
  77         check_for_duplicates=kwargs.get('check_for_duplicates', False),
  78         klass=kwargs.get('klass')
  79     )
  80     instance = parser.parse()
  81     instance.wrapwidth = kwargs.get('wrapwidth', 78)
  82     return instance
  83 # }}}
  84 # _is_file {{{
  85
  86
  87 def _is_file(filename_or_contents):
  88     """
  89     Safely returns the value of os.path.exists(filename_or_contents).
  90
  91     Arguments:
  92
  93     ``filename_or_contents``
  94         either a filename, or a string holding the contents of some file.
  95         In the latter case, this function will always return False.
  96     """
  97     try:
  98         return os.path.isfile(filename_or_contents)
  99     except (TypeError, ValueError, UnicodeEncodeError):
 100         return False
 101 # }}}
 102 # function pofile() {{{
 103
 104
 105 # pylint: disable=redefined-outer-name
 106 def pofile(pofile, **kwargs):
 107     """
 108     Convenience function that parses the po or pot file ``pofile`` and returns
 109     a :class:`~polib.POFile` instance.
 110
 111     Arguments:
 112
 113     ``pofile``
 114         string, full or relative path to the po/pot file or its content (data).
 115
 116     ``wrapwidth``
 117         integer, the wrap width, only useful when the ``-w`` option was passed
 118         to xgettext (optional, default: ``78``).
 119
 120     ``encoding``
 121         string, the encoding to use (e.g. "utf-8") (default: ``None``, the
 122         encoding will be auto-detected).
 123
 124     ``check_for_duplicates``
 125         whether to check for duplicate entries when adding entries to the
 126         file (optional, default: ``False``).
 127
 128     ``klass``
 129         class which is used to instantiate the return value (optional,
 130         default: ``None``, the return value with be a :class:`~polib.POFile`
 131         instance).
 132     """
 133     return _pofile_or_mofile(pofile, 'pofile', **kwargs)
 134 # }}}
 135 # function mofile() {{{
 136
 137
 138 # pylint: disable=redefined-outer-name
 139 def mofile(mofile, **kwargs):
 140     """
 141     Convenience function that parses the mo file ``mofile`` and returns a
 142     :class:`~polib.MOFile` instance.
 143
 144     Arguments:
 145
 146     ``mofile``
 147         string, full or relative path to the mo file or its content (string
 148         or bytes).
 149
 150     ``wrapwidth``
 151         integer, the wrap width, only useful when the ``-w`` option was passed
 152         to xgettext to generate the po file that was used to format the mo file
 153         (optional, default: ``78``).
 154
 155     ``encoding``
 156         string, the encoding to use (e.g. "utf-8") (default: ``None``, the
 157         encoding will be auto-detected).
 158
 159     ``check_for_duplicates``
 160         whether to check for duplicate entries when adding entries to the
 161         file (optional, default: ``False``).
 162
 163     ``klass``
 164         class which is used to instantiate the return value (optional,
 165         default: ``None``, the return value with be a :class:`~polib.POFile`
 166         instance).
 167     """
 168     return _pofile_or_mofile(mofile, 'mofile', **kwargs)
 169 # }}}
 170 # function detect_encoding() {{{
 171
 172
 173 def detect_encoding(file, binary_mode=False):
 174     """
 175     Try to detect the encoding used by the ``file``. The ``file`` argument can
 176     be a PO or MO file path or a string containing the contents of the file.
 177     If the encoding cannot be detected, the function will return the value of
 178     ``default_encoding``.
 179
 180     Arguments:
 181
 182     ``file``
 183         string, full or relative path to the po/mo file or its content.
 184
 185     ``binary_mode``
 186         boolean, set this to True if ``file`` is a mo file.
 187     """
 188     PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
 189     rxt = re.compile(u(PATTERN))
 190     rxb = re.compile(b(PATTERN))
 191
 192     def charset_exists(charset):
 193         """Check whether ``charset`` is valid or not."""
 194         try:
 195             codecs.lookup(charset)
 196         except LookupError:
 197             return False
 198         return True
 199
 200     if not _is_file(file):
 201         try:
 202             match = rxt.search(file)
 203         except TypeError:
 204             match = rxb.search(file)
 205         if match:
 206             enc = match.group(1).strip()
 207             if not isinstance(enc, text_type):
 208                 enc = enc.decode('utf-8')
 209             if charset_exists(enc):
 210                 return enc
 211     else:
 212         # For PY3, always treat as binary
 213         if binary_mode or PY3:
 214             mode = 'rb'
 215             rx = rxb
 216         else:
 217             mode = 'r'
 218             rx = rxt
 219         f = open(file, mode)
 220         for line in f.readlines():
 221             match = rx.search(line)
 222             if match:
 223                 f.close()
 224                 enc = match.group(1).strip()
 225                 if not isinstance(enc, text_type):
 226                     enc = enc.decode('utf-8')
 227                 if charset_exists(enc):
 228                     return enc
 229         f.close()
 230     return default_encoding
 231 # }}}
 232 # function escape() {{{
 233
 234
 235 def escape(st):
 236     """
 237     Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 238     the given string ``st`` and returns it.
 239     """
 240     return st.replace('\\', r'\\')\
 241              .replace('\t', r'\t')\
 242              .replace('\r', r'\r')\
 243              .replace('\n', r'\n')\
 244              .replace('\"', r'\"')
 245 # }}}
 246 # function unescape() {{{
 247
 248
 249 def unescape(st):
 250     """
 251     Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 252     the given string ``st`` and returns it.
 253     """
 254     def unescape_repl(m):
 255         m = m.group(1)
 256         if m == 'n':
 257             return '\n'
 258         if m == 't':
 259             return '\t'
 260         if m == 'r':
 261             return '\r'
 262         if m == '\\':
 263             return '\\'
 264         return m  # handles escaped double quote
 265     return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
 266 # }}}
 267 # function natural_sort() {{{
 268
 269
 270 def natural_sort(lst):
 271     """
 272     Sort naturally the given list.
 273     Credits: http://stackoverflow.com/a/4836734
 274     """
 275     def convert(text):
 276         return int(text) if text.isdigit() else text.lower()
 277
 278     def alphanum_key(key):
 279         return [convert(c) for c in re.split('([0-9]+)', key)]
 280
 281     return sorted(lst, key=alphanum_key)
 282
 283 # }}}
 284 # class _BaseFile {{{
 285
 286
 287 class _BaseFile(list):
 288     """
 289     Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
 290     classes. This class should **not** be instantiated directly.
 291     """
 292
 293     def __init__(self, *_args, **kwargs):
 294         """
 295         Constructor, accepts the following keyword arguments:
 296
 297         ``pofile``
 298             string, the path to the po or mo file, or its content as a string.
 299
 300         ``wrapwidth``
 301             integer, the wrap width, only useful when the ``-w`` option was
 302             passed to xgettext (optional, default: ``78``).
 303
 304         ``encoding``
 305             string, the encoding to use, defaults to ``default_encoding``
 306             global variable (optional).
 307
 308         ``check_for_duplicates``
 309             whether to check for duplicate entries when adding entries to the
 310             file, (optional, default: ``False``).
 311         """
 312         list.__init__(self)
 313         # the opened file handle
 314         pofile = kwargs.get('pofile', None)  # pylint: disable=redefined-outer-name
 315         if pofile and _is_file(pofile):
 316             self.fpath = pofile
 317         else:
 318             self.fpath = kwargs.get('fpath')
 319         # the width at which lines should be wrapped
 320         self.wrapwidth = kwargs.get('wrapwidth', 78)
 321         # the file encoding
 322         self.encoding = kwargs.get('encoding', default_encoding)
 323         # whether to check for duplicate entries or not
 324         self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
 325         # header
 326         self.header = ''
 327         # both po and mo files have metadata
 328         self.metadata = {}
 329         self.metadata_is_fuzzy = 0
 330
 331     def __unicode__(self):
 332         """
 333         Returns the unicode representation of the file.
 334         """
 335         ret = []
 336         entries = [self.metadata_as_entry()] + \
 337                   [e for e in self if not e.obsolete]
 338         for entry in entries:
 339             ret.append(entry.__unicode__(self.wrapwidth))
 340         for entry in self.obsolete_entries():  # pylint: disable=no-member
 341             ret.append(entry.__unicode__(self.wrapwidth))
 342         ret = u('\n').join(ret)
 343         return ret
 344
 345     if PY3:
 346         def __str__(self):
 347             return self.__unicode__()
 348     else:
 349         def __str__(self):
 350             """
 351             Returns the string representation of the file.
 352             """
 353             return compat.ustr(self).encode(self.encoding)
 354
 355     def __contains__(self, entry):
 356         """
 357         Overridden ``list`` method to implement the membership test (in and
 358         not in).
 359         The method considers that an entry is in the file if it finds an entry
 360         that has the same msgid (the test is **case sensitive**) and the same
 361         msgctxt (or none for both entries).
 362
 363         Argument:
 364
 365         ``entry``
 366             an instance of :class:`~polib._BaseEntry`.
 367         """
 368         return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \
 369             is not None
 370
 371     def __eq__(self, other):
 372         return str(self) == str(other)
 373
 374     def __hash__(self):
 375         return hash(str(self))
 376
 377     def append(self, entry):
 378         """
 379         Overridden method to check for duplicates entries, if a user tries to
 380         add an entry that is already in the file, the method will raise a
 381         ``ValueError`` exception.
 382
 383         Argument:
 384
 385         ``entry``
 386             an instance of :class:`~polib._BaseEntry`.
 387         """
 388         # check_for_duplicates may not be defined (yet) when unpickling.
 389         # But if pickling, we never want to check for duplicates anyway.
 390         if getattr(self, 'check_for_duplicates', False) and entry in self:
 391             raise ValueError('Entry "%s" already exists' % entry.msgid)
 392         super(_BaseFile, self).append(entry)
 393
 394     def insert(self, index, entry):
 395         """
 396         Overridden method to check for duplicates entries, if a user tries to
 397         add an entry that is already in the file, the method will raise a
 398         ``ValueError`` exception.
 399
 400         Arguments:
 401
 402         ``index``
 403             index at which the entry should be inserted.
 404
 405         ``entry``
 406             an instance of :class:`~polib._BaseEntry`.
 407         """
 408         if self.check_for_duplicates and entry in self:
 409             raise ValueError('Entry "%s" already exists' % entry.msgid)
 410         super(_BaseFile, self).insert(index, entry)
 411
 412     def metadata_as_entry(self):
 413         """
 414         Returns the file metadata as a :class:`~polib.POFile` instance.
 415         """
 416         e = POEntry(msgid='')
 417         mdata = self.ordered_metadata()
 418         if mdata:
 419             strs = []
 420             for name, value in mdata:
 421                 # Strip whitespace off each line in a multi-line entry
 422                 strs.append('%s: %s' % (name, value))
 423             e.msgstr = '\n'.join(strs) + '\n'
 424         if self.metadata_is_fuzzy:
 425             e.flags.append('fuzzy')
 426         return e
 427
 428     def save(self, fpath=None, repr_method='__unicode__', newline=None):
 429         """
 430         Saves the po file to ``fpath``.
 431         If it is an existing file and no ``fpath`` is provided, then the
 432         existing file is rewritten with the modified data.
 433
 434         Keyword arguments:
 435
 436         ``fpath``
 437             string, full or relative path to the file.
 438
 439         ``repr_method``
 440             string, the method to use for output.
 441
 442         ``newline``
 443             string, controls how universal newlines works
 444         """
 445         if self.fpath is None and fpath is None:
 446             raise IOError('You must provide a file path to save() method')
 447         contents = getattr(self, repr_method)()
 448         if fpath is None:
 449             fpath = self.fpath
 450         if repr_method == 'to_binary':
 451             fhandle = open(fpath, 'wb')
 452         else:
 453             fhandle = io.open(
 454                 fpath,
 455                 'w',
 456                 encoding=self.encoding,
 457                 newline=newline
 458             )
 459             if not isinstance(contents, text_type):
 460                 contents = contents.decode(self.encoding)
 461         fhandle.write(contents)
 462         fhandle.close()
 463         # set the file path if not set
 464         if self.fpath is None and fpath:
 465             self.fpath = fpath
 466
 467     def find(self, st, by='msgid', include_obsolete_entries=False,
 468              msgctxt=False):
 469         """
 470         Find the entry which msgid (or property identified by the ``by``
 471         argument) matches the string ``st``.
 472
 473         Keyword arguments:
 474
 475         ``st``
 476             string, the string to search for.
 477
 478         ``by``
 479             string, the property to use for comparison (default: ``msgid``).
 480
 481         ``include_obsolete_entries``
 482             boolean, whether to also search in entries that are obsolete.
 483
 484         ``msgctxt``
 485             string, allows specifying a specific message context for the
 486             search.
 487         """
 488         if include_obsolete_entries:
 489             entries = self[:]
 490         else:
 491             entries = [e for e in self if not e.obsolete]
 492         matches = []
 493         for e in entries:
 494             if getattr(e, by) == st:
 495                 if msgctxt is not False and e.msgctxt != msgctxt:
 496                     continue
 497                 matches.append(e)
 498         if len(matches) == 1:
 499             return matches[0]
 500         elif len(matches) > 1:
 501             if not msgctxt:
 502                 # find the entry with no msgctx
 503                 e = None
 504                 for m in matches:
 505                     if not m.msgctxt:
 506                         e = m
 507                 if e:
 508                     return e
 509                 # fallback to the first entry found
 510                 return matches[0]
 511         return None
 512
 513     def ordered_metadata(self):
 514         """
 515         Convenience method that returns an ordered version of the metadata
 516         dictionary. The return value is list of tuples (metadata name,
 517         metadata_value).
 518         """
 519         # copy the dict first
 520         metadata = self.metadata.copy()
 521         data_order = [
 522             'Project-Id-Version',
 523             'Report-Msgid-Bugs-To',
 524             'POT-Creation-Date',
 525             'PO-Revision-Date',
 526             'Last-Translator',
 527             'Language-Team',
 528             'Language',
 529             'MIME-Version',
 530             'Content-Type',
 531             'Content-Transfer-Encoding',
 532             'Plural-Forms'
 533         ]
 534         ordered_data = []
 535         for data in data_order:
 536             try:
 537                 value = metadata.pop(data)
 538                 ordered_data.append((data, value))
 539             except KeyError:
 540                 pass
 541         # the rest of the metadata will be alphabetically ordered since there
 542         # are no specs for this AFAIK
 543         for data in natural_sort(metadata.keys()):
 544             value = metadata[data]
 545             ordered_data.append((data, value))
 546         return ordered_data
 547
 548     def to_binary(self):
 549         """
 550         Return the binary representation of the file.
 551         """
 552         offsets = []
 553         entries = self.translated_entries()  # pylint: disable=no-member
 554
 555         # the keys are sorted in the .mo file
 556         def cmp(_self, other):  # pylint: disable=unused-variable
 557             # msgfmt compares entries with msgctxt if it exists
 558             self_msgid = _self.msgctxt or _self.msgid
 559             other_msgid = other.msgctxt or other.msgid
 560             if self_msgid > other_msgid:
 561                 return 1
 562             elif self_msgid < other_msgid:
 563                 return -1
 564             else:
 565                 return 0
 566         # add metadata entry
 567         entries.sort(key=lambda o: o.msgid_with_context.encode('utf-8'))
 568         mentry = self.metadata_as_entry()
 569         entries = [mentry] + entries
 570         entries_len = len(entries)
 571         ids, strs = b(''), b('')
 572         for e in entries:
 573             # For each string, we need size and file offset.  Each string is
 574             # NUL terminated; the NUL does not count into the size.
 575             msgid = b('')
 576             if e.msgctxt:
 577                 # Contexts are stored by storing the concatenation of the
 578                 # context, a <EOT> byte, and the original string
 579                 msgid = self._encode(e.msgctxt + '\4')
 580             if e.msgid_plural:
 581                 msgstr = []
 582                 for index in sorted(e.msgstr_plural.keys()):
 583                     msgstr.append(e.msgstr_plural[index])
 584                 msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
 585                 msgstr = self._encode('\0'.join(msgstr))
 586             else:
 587                 msgid += self._encode(e.msgid)
 588                 msgstr = self._encode(e.msgstr)
 589             offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
 590             ids += msgid + b('\0')
 591             strs += msgstr + b('\0')
 592
 593         # The header is 7 32-bit unsigned integers.
 594         keystart = 7 * 4 + 16 * entries_len
 595         # and the values start after the keys
 596         valuestart = keystart + len(ids)
 597         koffsets = []
 598         voffsets = []
 599         # The string table first has the list of keys, then the list of values.
 600         # Each entry has first the size of the string, then the file offset.
 601         for o1, l1, o2, l2 in offsets:
 602             koffsets += [l1, o1 + keystart]
 603             voffsets += [l2, o2 + valuestart]
 604         offsets = koffsets + voffsets
 605
 606         output = struct.pack(
 607             "Iiiiiii",
 608             # Magic number
 609             MOFile.MAGIC,
 610             # Version
 611             0,
 612             # number of entries
 613             entries_len,
 614             # start of key index
 615             7 * 4,
 616             # start of value index
 617             7 * 4 + entries_len * 8,
 618             # size and offset of hash table, we don't use hash tables
 619             0, keystart
 620
 621         )
 622         if PY3 and sys.version_info.minor > 1:  # python 3.2 or superior
 623             output += array.array("i", offsets).tobytes()
 624         else:
 625             output += array.array("i", offsets).tostring()  # pylint: disable=no-member
 626         output += ids
 627         output += strs
 628         return output
 629
 630     def _encode(self, mixed):
 631         """
 632         Encodes the given ``mixed`` argument with the file encoding if and
 633         only if it's an unicode string and returns the encoded string.
 634         """
 635         if isinstance(mixed, text_type):
 636             mixed = mixed.encode(self.encoding)
 637         return mixed
 638 # }}}
 639 # class POFile {{{
 640
 641
 642 class POFile(_BaseFile):
 643     """
 644     Po (or Pot) file reader/writer.
 645     This class inherits the :class:`~polib._BaseFile` class and, by extension,
 646     the python ``list`` type.
 647     """
 648
 649     def __unicode__(self):
 650         """
 651         Returns the unicode representation of the po file.
 652         """
 653         ret, headers = '', self.header.split('\n')
 654         for header in headers:
 655             if not header:
 656                 ret += "#\n"
 657             elif header[:1] in [',', ':']:
 658                 ret += '#%s\n' % header
 659             else:
 660                 ret += '# %s\n' % header
 661
 662         if not isinstance(ret, text_type):
 663             ret = ret.decode(self.encoding)
 664
 665         return ret + _BaseFile.__unicode__(self)
 666
 667     def save_as_mofile(self, fpath):
 668         """
 669         Saves the binary representation of the file to given ``fpath``.
 670
 671         Keyword argument:
 672
 673         ``fpath``
 674             string, full or relative path to the mo file.
 675         """
 676         _BaseFile.save(self, fpath, 'to_binary')
 677
 678     def percent_translated(self):
 679         """
 680         Convenience method that returns the percentage of translated
 681         messages.
 682         """
 683         total = len([e for e in self if not e.obsolete])
 684         if total == 0:
 685             return 100
 686         translated = len(self.translated_entries())
 687         return int(translated * 100 / float(total))
 688
 689     def translated_entries(self):
 690         """
 691         Convenience method that returns the list of translated entries.
 692         """
 693         return [e for e in self if e.translated()]
 694
 695     def untranslated_entries(self):
 696         """
 697         Convenience method that returns the list of untranslated entries.
 698         """
 699         return [e for e in self if not e.translated() and not e.obsolete
 700                 and not e.fuzzy]
 701
 702     def fuzzy_entries(self):
 703         """
 704         Convenience method that returns the list of fuzzy entries.
 705         """
 706         return [e for e in self if e.fuzzy and not e.obsolete]
 707
 708     def obsolete_entries(self):
 709         """
 710         Convenience method that returns the list of obsolete entries.
 711         """
 712         return [e for e in self if e.obsolete]
 713
 714     def merge(self, refpot):
 715         """
 716         Convenience method that merges the current pofile with the pot file
 717         provided. It behaves exactly as the gettext msgmerge utility:
 718
 719         * comments of this file will be preserved, but extracted comments and
 720           occurrences will be discarded;
 721         * any translations or comments in the file will be discarded, however,
 722           dot comments and file positions will be preserved;
 723         * the fuzzy flags are preserved.
 724
 725         Keyword argument:
 726
 727         ``refpot``
 728             object POFile, the reference catalog.
 729         """
 730         # Store entries in dict/set for faster access
 731         self_entries = dict(
 732             (entry.msgid_with_context, entry) for entry in self
 733         )
 734         refpot_msgids = set(entry.msgid_with_context for entry in refpot)
 735         # Merge entries that are in the refpot
 736         for entry in refpot:
 737             e = self_entries.get(entry.msgid_with_context)
 738             if e is None:
 739                 e = POEntry()
 740                 self.append(e)
 741             e.merge(entry)
 742         # ok, now we must "obsolete" entries that are not in the refpot anymore
 743         for entry in self:
 744             if entry.msgid_with_context not in refpot_msgids:
 745                 entry.obsolete = True
 746 # }}}
 747 # class MOFile {{{
 748
 749
 750 class MOFile(_BaseFile):
 751     """
 752     Mo file reader/writer.
 753     This class inherits the :class:`~polib._BaseFile` class and, by
 754     extension, the python ``list`` type.
 755     """
 756     MAGIC = 0x950412de
 757     MAGIC_SWAPPED = 0xde120495
 758
 759     def __init__(self, *args, **kwargs):
 760         """
 761         Constructor, accepts all keywords arguments accepted by
 762         :class:`~polib._BaseFile` class.
 763         """
 764         _BaseFile.__init__(self, *args, **kwargs)
 765         self.magic_number = None
 766         self.version = 0
 767
 768     def save_as_pofile(self, fpath):
 769         """
 770         Saves the mofile as a pofile to ``fpath``.
 771
 772         Keyword argument:
 773
 774         ``fpath``
 775             string, full or relative path to the file.
 776         """
 777         _BaseFile.save(self, fpath)
 778
 779     # pylint: disable=no-self-use,arguments-differ
 780     def save(self, fpath=None):
 781         """
 782         Saves the mofile to ``fpath``.
 783
 784         Keyword argument:
 785
 786         ``fpath``
 787             string, full or relative path to the file.
 788         """
 789         _BaseFile.save(self, fpath, 'to_binary')
 790
 791     # pylint: disable=no-self-use
 792     def percent_translated(self):
 793         """
 794         Convenience method to keep the same interface with POFile instances.
 795         """
 796         return 100
 797
 798     # pylint: disable=no-self-use
 799     def translated_entries(self):
 800         """
 801         Convenience method to keep the same interface with POFile instances.
 802         """
 803         return self
 804
 805     # pylint: disable=no-self-use
 806     def untranslated_entries(self):
 807         """
 808         Convenience method to keep the same interface with POFile instances.
 809         """
 810         return []
 811
 812     # pylint: disable=no-self-use
 813     def fuzzy_entries(self):
 814         """
 815         Convenience method to keep the same interface with POFile instances.
 816         """
 817         return []
 818
 819     # pylint: disable=no-self-use
 820     def obsolete_entries(self):
 821         """
 822         Convenience method to keep the same interface with POFile instances.
 823         """
 824         return []
 825 # }}}
 826 # class _BaseEntry {{{
 827
 828
 829 class _BaseEntry(object):
 830     """
 831     Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
 832     This class should **not** be instantiated directly.
 833     """
 834
 835     def __init__(self, *_args, **kwargs):
 836         """
 837         Constructor, accepts the following keyword arguments:
 838
 839         ``msgid``
 840             string, the entry msgid.
 841
 842         ``msgstr``
 843             string, the entry msgstr.
 844
 845         ``msgid_plural``
 846             string, the entry msgid_plural.
 847
 848         ``msgstr_plural``
 849             dict, the entry msgstr_plural lines.
 850
 851         ``msgctxt``
 852             string, the entry context (msgctxt).
 853
 854         ``obsolete``
 855             bool, whether the entry is "obsolete" or not.
 856
 857         ``encoding``
 858             string, the encoding to use, defaults to ``default_encoding``
 859             global variable (optional).
 860         """
 861         self.msgid = kwargs.get('msgid', '')
 862         self.msgstr = kwargs.get('msgstr', '')
 863         self.msgid_plural = kwargs.get('msgid_plural', '')
 864         self.msgstr_plural = kwargs.get('msgstr_plural', {})
 865         self.msgctxt = kwargs.get('msgctxt', None)
 866         self.obsolete = kwargs.get('obsolete', False)
 867         self.encoding = kwargs.get('encoding', default_encoding)
 868
 869     def __unicode__(self, wrapwidth=78):
 870         """
 871         Returns the unicode representation of the entry.
 872         """
 873         if self.obsolete:
 874             delflag = '#~ '
 875         else:
 876             delflag = ''
 877         ret = []
 878         # write the msgctxt if any
 879         if self.msgctxt is not None:
 880             ret += self._str_field("msgctxt", delflag, "", self.msgctxt,
 881                                    wrapwidth)
 882         # write the msgid
 883         ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
 884         # write the msgid_plural if any
 885         if self.msgid_plural:
 886             ret += self._str_field("msgid_plural", delflag, "",
 887                                    self.msgid_plural, wrapwidth)
 888         if self.msgstr_plural:
 889             # write the msgstr_plural if any
 890             msgstrs = self.msgstr_plural
 891             keys = list(msgstrs)
 892             keys.sort()
 893             for index in keys:
 894                 msgstr = msgstrs[index]
 895                 plural_index = '[%s]' % index
 896                 ret += self._str_field("msgstr", delflag, plural_index, msgstr,
 897                                        wrapwidth)
 898         else:
 899             # otherwise write the msgstr
 900             ret += self._str_field("msgstr", delflag, "", self.msgstr,
 901                                    wrapwidth)
 902         ret.append('')
 903         ret = u('\n').join(ret)
 904         return ret
 905
 906     if PY3:
 907         def __str__(self):
 908             return self.__unicode__()
 909     else:
 910         def __str__(self):
 911             """
 912             Returns the string representation of the entry.
 913             """
 914             return compat.ustr(self).encode(self.encoding)
 915
 916     def __eq__(self, other):
 917         return str(self) == str(other)
 918
 919     def __hash__(self):
 920         return hash(str(self))
 921
 922     # pylint: disable=no-self-use
 923     def _str_field(self, fieldname, delflag, plural_index, field,
 924                    wrapwidth=78):
 925         lines = field.splitlines(True)
 926         if len(lines) > 1:
 927             lines = [''] + lines  # start with initial empty line
 928         else:
 929             escaped_field = escape(field)
 930             specialchars_count = 0
 931             for c in ['\\', '\n', '\r', '\t', '"']:
 932                 specialchars_count += field.count(c)
 933             # comparison must take into account fieldname length + one space
 934             # + 2 quotes (eg. msgid "<string>")
 935             flength = len(fieldname) + 3
 936             if plural_index:
 937                 flength += len(plural_index)
 938             real_wrapwidth = wrapwidth - flength + specialchars_count
 939             if wrapwidth > 0 and len(field) > real_wrapwidth:
 940                 # Wrap the line but take field name into account
 941                 lines = [''] + [unescape(item) for item in textwrap.wrap(
 942                     escaped_field,
 943                     wrapwidth - 2,  # 2 for quotes ""
 944                     drop_whitespace=False,
 945                     break_long_words=False
 946                 )]
 947             else:
 948                 lines = [field]
 949         if fieldname.startswith('previous_'):
 950             # quick and dirty trick to get the real field name
 951             fieldname = fieldname[9:]
 952
 953         ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index,
 954                                 escape(lines.pop(0)))]
 955         for line in lines:
 956             ret.append('%s"%s"' % (delflag, escape(line)))
 957         return ret
 958
 959     @property
 960     def msgid_with_context(self):
 961         if self.msgctxt:
 962             return '%s%s%s' % (self.msgctxt, "\x04", self.msgid)
 963         return self.msgid
 964 # }}}
 965 # class POEntry {{{
 966
 967
 968 class POEntry(_BaseEntry):
 969     """
 970     Represents a po file entry.
 971     """
 972
 973     def __init__(self, *args, **kwargs):
 974         """
 975         Constructor, accepts the following keyword arguments:
 976
 977         ``comment``
 978             string, the entry comment.
 979
 980         ``tcomment``
 981             string, the entry translator comment.
 982
 983         ``occurrences``
 984             list, the entry occurrences.
 985
 986         ``flags``
 987             list, the entry flags.
 988
 989         ``previous_msgctxt``
 990             string, the entry previous context.
 991
 992         ``previous_msgid``
 993             string, the entry previous msgid.
 994
 995         ``previous_msgid_plural``
 996             string, the entry previous msgid_plural.
 997
 998         ``linenum``
 999             integer, the line number of the entry
1000         """
1001         _BaseEntry.__init__(self, *args, **kwargs)
1002         self.comment = kwargs.get('comment', '')
1003         self.tcomment = kwargs.get('tcomment', '')
1004         self.occurrences = kwargs.get('occurrences', [])
1005         self.flags = kwargs.get('flags', [])
1006         self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
1007         self.previous_msgid = kwargs.get('previous_msgid', None)
1008         self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
1009         self.linenum = kwargs.get('linenum', None)
1010
1011     def __unicode__(self, wrapwidth=78):
1012         """
1013         Returns the unicode representation of the entry.
1014         """
1015         ret = []
1016         # comments first, if any (with text wrapping as xgettext does)
1017         if self.obsolete:
1018             comments = [('tcomment', '# ')]
1019         else:
1020             comments = [('comment', '#. '), ('tcomment', '# ')]
1021         for c in comments:
1022             val = getattr(self, c[0])
1023             if val:
1024                 for comment in val.split('\n'):
1025                     if len(comment) + len(c[1]) > wrapwidth > 0:
1026                         ret += textwrap.wrap(
1027                             comment,
1028                             wrapwidth,
1029                             initial_indent=c[1],
1030                             subsequent_indent=c[1],
1031                             break_long_words=False
1032                         )
1033                     else:
1034                         ret.append('%s%s' % (c[1], comment))
1035
1036         # occurrences (with text wrapping as xgettext does)
1037         if not self.obsolete and self.occurrences:
1038             filelist = []
1039             for fpath, lineno in self.occurrences:
1040                 if lineno:
1041                     filelist.append('%s:%s' % (fpath, lineno))
1042                 else:
1043                     filelist.append(fpath)
1044             filestr = ' '.join(filelist)
1045             if len(filestr) + 3 > wrapwidth > 0:
1046                 # textwrap split words that contain hyphen, this is not
1047                 # what we want for filenames, so the dirty hack is to
1048                 # temporally replace hyphens with a char that a file cannot
1049                 # contain, like "*"
1050                 ret += [line.replace('*', '-') for line in textwrap.wrap(
1051                     filestr.replace('-', '*'),
1052                     wrapwidth,
1053                     initial_indent='#: ',
1054                     subsequent_indent='#: ',
1055                     break_long_words=False
1056                 )]
1057             else:
1058                 ret.append('#: ' + filestr)
1059
1060         # flags (TODO: wrapping ?)
1061         if self.flags:
1062             ret.append('#, %s' % ', '.join(self.flags))
1063
1064         # previous context and previous msgid/msgid_plural
1065         fields = ['previous_msgctxt', 'previous_msgid',
1066                   'previous_msgid_plural']
1067         if self.obsolete:
1068             prefix = "#~| "
1069         else:
1070             prefix = "#| "
1071         for f in fields:
1072             val = getattr(self, f)
1073             if val is not None:
1074                 ret += self._str_field(f, prefix, "", val, wrapwidth)
1075
1076         ret.append(_BaseEntry.__unicode__(self, wrapwidth))
1077         ret = u('\n').join(ret)
1078         return ret
1079
1080     # pylint: disable=cmp-method,too-many-return-statements
1081     def __cmp__(self, other):
1082         """
1083         Called by comparison operations if rich comparison is not defined.
1084         """
1085         # First: Obsolete test
1086         if self.obsolete != other.obsolete:
1087             if self.obsolete:
1088                 return -1
1089             else:
1090                 return 1
1091         # Work on a copy to protect original
1092         occ1 = sorted(self.occurrences[:])
1093         occ2 = sorted(other.occurrences[:])
1094         if occ1 > occ2:
1095             return 1
1096         if occ1 < occ2:
1097             return -1
1098         # Compare context
1099         msgctxt = self.msgctxt or '0'
1100         othermsgctxt = other.msgctxt or '0'
1101         if msgctxt > othermsgctxt:
1102             return 1
1103         elif msgctxt < othermsgctxt:
1104             return -1
1105         # Compare msgid_plural
1106         msgid_plural = self.msgid_plural or '0'
1107         othermsgid_plural = other.msgid_plural or '0'
1108         if msgid_plural > othermsgid_plural:
1109             return 1
1110         elif msgid_plural < othermsgid_plural:
1111             return -1
1112         # Compare msgstr_plural
1113         if self.msgstr_plural and isinstance(self.msgstr_plural, dict):
1114             msgstr_plural = list(self.msgstr_plural.values())
1115         else:
1116             msgstr_plural = []
1117         if other.msgstr_plural and isinstance(other.msgstr_plural, dict):
1118             othermsgstr_plural = list(other.msgstr_plural.values())
1119         else:
1120             othermsgstr_plural = []
1121         if msgstr_plural > othermsgstr_plural:
1122             return 1
1123         elif msgstr_plural < othermsgstr_plural:
1124             return -1
1125         # Compare msgid
1126         if self.msgid > other.msgid:
1127             return 1
1128         elif self.msgid < other.msgid:
1129             return -1
1130         # Compare msgstr
1131         if self.msgstr > other.msgstr:
1132             return 1
1133         elif self.msgstr < other.msgstr:
1134             return -1
1135         return 0
1136
1137     def __gt__(self, other):
1138         return self.__cmp__(other) > 0
1139
1140     def __lt__(self, other):
1141         return self.__cmp__(other) < 0
1142
1143     def __ge__(self, other):
1144         return self.__cmp__(other) >= 0
1145
1146     def __le__(self, other):
1147         return self.__cmp__(other) <= 0
1148
1149     def __eq__(self, other):
1150         return self.__cmp__(other) == 0
1151
1152     def __ne__(self, other):
1153         return self.__cmp__(other) != 0
1154
1155     def translated(self):
1156         """
1157         Returns ``True`` if the entry has been translated or ``False``
1158         otherwise.
1159         """
1160         if self.obsolete or self.fuzzy:
1161             return False
1162         if self.msgstr != '':
1163             return True
1164         if self.msgstr_plural:
1165             for pos in self.msgstr_plural:
1166                 if self.msgstr_plural[pos] == '':
1167                     return False
1168             return True
1169         return False
1170
1171     def merge(self, other):
1172         """
1173         Merge the current entry with the given pot entry.
1174         """
1175         self.msgid = other.msgid
1176         self.msgctxt = other.msgctxt
1177         self.occurrences = other.occurrences
1178         self.comment = other.comment
1179         fuzzy = self.fuzzy
1180         self.flags = other.flags[:]  # clone flags
1181         if fuzzy:
1182             self.flags.append('fuzzy')
1183         self.msgid_plural = other.msgid_plural
1184         self.obsolete = other.obsolete
1185         self.previous_msgctxt = other.previous_msgctxt
1186         self.previous_msgid = other.previous_msgid
1187         self.previous_msgid_plural = other.previous_msgid_plural
1188         if other.msgstr_plural:
1189             for pos in other.msgstr_plural:
1190                 try:
1191                     # keep existing translation at pos if any
1192                     self.msgstr_plural[pos]
1193                 except KeyError:
1194                     self.msgstr_plural[pos] = ''
1195
1196     @property
1197     def fuzzy(self):
1198         return 'fuzzy' in self.flags
1199
1200     def __hash__(self):
1201         return hash((self.msgid, self.msgstr))
1202 # }}}
1203 # class MOEntry {{{
1204
1205
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """
    def __init__(self, *args, **kwargs):
        """
        Constructor. For consistency with :class:`~polib.POEntry` the
        following keyword arguments are accepted:

        ``comment``
        ``tcomment``
        ``occurrences``
        ``flags``
        ``previous_msgctxt``
        ``previous_msgid``
        ``previous_msgid_plural``

        Note: they hold no real meaning in the context of MO files, so
        their values are ignored and the matching attributes are always
        initialised empty.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # text attributes start as empty strings
        self.comment = self.tcomment = ''
        # list attributes each get their own new list
        self.occurrences, self.flags = [], []
        # "previous" attributes are unset
        self.previous_msgctxt = None
        self.previous_msgid = None
        self.previous_msgid_plural = None

    def __hash__(self):
        """Hash the entry on its ``(msgid, msgstr)`` pair."""
        identity = (self.msgid, self.msgstr)
        return hash(identity)
1238
1239 # }}}
1240 # class _POFileParser {{{
1241
1242
class _POFileParser(object):
    """
    A finite state machine to parse efficiently and correctly po
    file format.
    """

    # matches the start of a plural msgstr line and captures the whole
    # (possibly multi-digit) index
    _PLURAL_INDEX_RE = re.compile(r'msgstr\[(\d+)')

    # pylint: disable=redefined-outer-name
    def __init__(self, pofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``pofile``
            string, path to the po file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class instantiated to hold the parsed entries, ``POFile`` is
            used when it is missing or None (optional).
        """
        enc = kwargs.get('encoding', default_encoding)
        if _is_file(pofile):
            try:
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
            except LookupError:
                # unknown encoding name: fall back to the default one
                enc = default_encoding
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
        else:
            # not a file: the argument is the po content itself
            self.fhandle = pofile.splitlines()

        klass = kwargs.get('klass')
        if klass is None:
            klass = POFile
        self.instance = klass(
            pofile=pofile,
            encoding=enc,
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )
        self.transitions = {}
        self.current_line = 0
        self.current_entry = POEntry(linenum=self.current_line)
        self.current_state = 'st'
        self.current_token = None
        # two memo flags used in handlers
        self.msgstr_index = 0
        self.entry_obsolete = 0
        # Configure the state machine, by adding transitions.
        # Signification of symbols:
        #     * ST: Beginning of the file (start)
        #     * HE: Header
        #     * TC: a translation comment
        #     * GC: a generated comment
        #     * OC: a file/line occurrence
        #     * FL: a flags line
        #     * CT: a message context
        #     * PC: a previous msgctxt
        #     * PM: a previous msgid
        #     * PP: a previous msgid_plural
        #     * MI: a msgid
        #     * MP: a msgid plural
        #     * MS: a msgstr
        #     * MX: a msgstr plural
        #     * MC: a msgid or msgstr continuation line
        all_states = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp',
                      'tc', 'ms', 'mp', 'mx', 'mi']

        self.add('tc', ['st', 'he'],                                     'he')
        self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms',
                        'mp', 'mx', 'mi'],                               'tc')
        self.add('gc', all_states,                                       'gc')
        self.add('oc', all_states,                                       'oc')
        self.add('fl', all_states,                                       'fl')
        self.add('pc', all_states,                                       'pc')
        self.add('pm', all_states,                                       'pm')
        self.add('pp', all_states,                                       'pp')
        self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm',
                        'pp', 'ms', 'mx'],                               'ct')
        self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc',
                 'pm', 'pp', 'ms', 'mx'],                                'mi')
        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'],             'mp')
        self.add('ms', ['mi', 'mp', 'tc'],                               'ms')
        self.add('mx', ['mi', 'mx', 'mp', 'tc'],                         'mx')
        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')

    def parse(self):
        """
        Run the state machine, parse the file line by line and call process()
        with the current matched symbol.

        Returns the populated file instance. Raises IOError on a syntax
        error. The underlying file, if any, is closed in every case, also
        when an error is raised.
        """
        try:
            tokens = self._parse_lines()
            if self.current_entry and len(tokens) > 0 and \
               not tokens[0].startswith('#'):
                # since entries are added when another entry is found, we
                # must add the last entry here (only if there are lines).
                # Trailing comments are ignored
                self.instance.append(self.current_entry)
            self._extract_metadata()
        finally:
            # close opened file (content given as a string is held as a
            # list of lines, which has nothing to close)
            if hasattr(self.fhandle, 'close'):
                self.fhandle.close()
        return self.instance

    # pylint: disable=too-many-branches
    def _parse_lines(self):
        """
        Classify every non-empty line and feed the state machine with it.

        Returns the tokens of the last non-empty line (an empty list when
        there was none). Raises IOError on a syntax error.
        """
        keywords = {
            'msgctxt': 'ct',
            'msgid': 'mi',
            'msgstr': 'ms',
            'msgid_plural': 'mp',
        }
        prev_keywords = {
            'msgid_plural': 'pp',
            'msgid': 'pm',
            'msgctxt': 'pc',
        }
        tokens = []
        fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
        for line in self.fhandle:
            self.current_line += 1
            if self.current_line == 1:
                # drop a leading byte order mark, if any
                bom = codecs.BOM_UTF8.decode('utf-8')
                if line.startswith(bom):
                    line = line[len(bom):]
            line = line.strip()
            if line == '':
                continue

            tokens = line.split(None, 2)
            nb_tokens = len(tokens)

            # "previous" fields of obsolete entries are skipped
            if tokens[0] == '#~|':
                continue

            if tokens[0] == '#~' and nb_tokens > 1:
                # obsolete line: drop the marker and handle the rest like a
                # regular line
                line = line[3:].strip()
                tokens = tokens[1:]
                nb_tokens -= 1
                self.entry_obsolete = 1
            else:
                self.entry_obsolete = 0

            # Take care of keywords like
            # msgid, msgid_plural, msgctxt & msgstr.
            if tokens[0] in keywords and nb_tokens > 1:
                line = line[len(tokens[0]):].lstrip()
                if re.search(r'([^\\]|^)"', line[1:-1]):
                    raise IOError('Syntax error in po file %s(line %s): '
                                  'unescaped double quote found' %
                                  (fpath, self.current_line))
                self.current_token = line
                self.process(keywords[tokens[0]])
                continue

            self.current_token = line

            if tokens[0] == '#:':
                if nb_tokens <= 1:
                    continue
                # we are on a occurrences line
                self.process('oc')

            elif line[:1] == '"':
                # we are on a continuation line
                if re.search(r'([^\\]|^)"', line[1:-1]):
                    raise IOError('Syntax error in po file %s(line %s): '
                                  'unescaped double quote found' %
                                  (fpath, self.current_line))
                self.process('mc')

            elif line[:7] == 'msgstr[':
                # we are on a msgstr plural
                self.process('mx')

            elif tokens[0] == '#,':
                if nb_tokens <= 1:
                    continue
                # we are on a flags line
                self.process('fl')

            elif tokens[0] == '#' or tokens[0].startswith('##'):
                # we are on a translator comment line
                self.process('tc')

            elif tokens[0] == '#.':
                if nb_tokens <= 1:
                    continue
                # we are on a generated comment line
                self.process('gc')

            elif tokens[0] == '#|':
                if nb_tokens <= 1:
                    raise IOError('Syntax error in po file %s(line %s)' %
                                  (fpath, self.current_line))

                # Remove the marker and any whitespace right after that.
                line = line[2:].lstrip()
                self.current_token = line

                if tokens[1].startswith('"'):
                    # Continuation of previous metadata.
                    self.process('mc')
                    continue

                if nb_tokens == 2:
                    # Invalid continuation line.
                    raise IOError('Syntax error in po file %s(line %s): '
                                  'invalid continuation line' %
                                  (fpath, self.current_line))

                # we are on a "previous translation" comment line,
                if tokens[1] not in prev_keywords:
                    # Unknown keyword in previous translation comment.
                    raise IOError('Syntax error in po file %s(line %s): '
                                  'unknown keyword %s' %
                                  (fpath, self.current_line,
                                   tokens[1]))

                # Remove the keyword and any whitespace
                # between it and the starting quote.
                line = line[len(tokens[1]):].lstrip()
                self.current_token = line
                self.process(prev_keywords[tokens[1]])

            else:
                raise IOError('Syntax error in po file %s(line %s)' %
                              (fpath, self.current_line))
        return tokens

    def _extract_metadata(self):
        """
        Move the entry with an empty msgid, if any, into the metadata dict.

        Each ``key: value`` line of its msgstr becomes one metadata item; a
        line without a colon is appended to the value of the previous key.
        """
        metadataentry = self.instance.find('')
        if metadataentry:  # metadata found
            # remove the entry
            self.instance.remove(metadataentry)
            self.instance.metadata_is_fuzzy = metadataentry.flags
            key = None
            for msg in metadataentry.msgstr.splitlines():
                try:
                    key, val = msg.split(':', 1)
                    self.instance.metadata[key] = val.strip()
                except (ValueError, KeyError):
                    if key is not None:
                        self.instance.metadata[key] += '\n' + msg.strip()

    def add(self, symbol, states, next_state):
        """
        Add a transition to the state machine.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``states``
            list, a list of states (two chars symbols).

        ``next_state``
            the next state the fsm will have after the action; the action
            is the method named ``handle_<next_state>``.
        """
        for state in states:
            action = getattr(self, 'handle_%s' % next_state)
            self.transitions[(symbol, state)] = (action, next_state)

    def process(self, symbol):
        """
        Process the transition corresponding to the current state and the
        symbol provided.

        The handler registered for the transition is called and the state
        only changes when it returns a true value.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        Raises IOError when no transition exists for the symbol in the
        current state or when the handler fails; the file handle is closed
        before raising.
        """
        try:
            (action, state) = self.transitions[(symbol, self.current_state)]
            if action():
                self.current_state = state
        except Exception:
            fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
            if hasattr(self.fhandle, 'close'):
                self.fhandle.close()
            raise IOError('Syntax error in po file %s(line %s)' %
                          (fpath, self.current_line))

    # state handlers

    def _start_entry_if_needed(self):
        """
        Store the entry being built and start a new one when the parser has
        already read a msgstr for it (state 'mc', 'ms' or 'mx').
        """
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)

    @staticmethod
    def _plural_index(token):
        """
        Return the integer N of a ``msgstr[N] "..."`` line.

        All the digits are read, so indexes above 9 are supported. Raises
        ValueError when the line does not start with ``msgstr[`` followed
        by at least one digit.
        """
        match = _POFileParser._PLURAL_INDEX_RE.match(token)
        if match is None:
            raise ValueError('invalid msgstr plural index: %r' % token)
        return int(match.group(1))

    def handle_he(self):
        """Handle a header comment."""
        if self.instance.header != '':
            self.instance.header += '\n'
        self.instance.header += self.current_token[2:]
        return 1

    def handle_tc(self):
        """Handle a translator comment."""
        self._start_entry_if_needed()
        if self.current_entry.tcomment != '':
            self.current_entry.tcomment += '\n'
        # drop the comment marker and at most one space after it
        tcomment = self.current_token.lstrip('#')
        if tcomment.startswith(' '):
            tcomment = tcomment[1:]
        self.current_entry.tcomment += tcomment
        return True

    def handle_gc(self):
        """Handle a generated comment."""
        self._start_entry_if_needed()
        if self.current_entry.comment != '':
            self.current_entry.comment += '\n'
        self.current_entry.comment += self.current_token[3:]
        return True

    def handle_oc(self):
        """Handle a file:num occurrence."""
        self._start_entry_if_needed()
        occurrences = self.current_token[3:].split()
        for occurrence in occurrences:
            if occurrence != '':
                try:
                    fil, line = occurrence.rsplit(':', 1)
                    if not line.isdigit():
                        # the colon is part of the path, not a line number
                        fil = occurrence
                        line = ''
                    self.current_entry.occurrences.append((fil, line))
                except (ValueError, AttributeError):
                    # no colon at all: a path without line number
                    self.current_entry.occurrences.append((occurrence, ''))
        return True

    def handle_fl(self):
        """Handle a flags line."""
        self._start_entry_if_needed()
        self.current_entry.flags += [c.strip() for c in
                                     self.current_token[3:].split(',')]
        return True

    def handle_pp(self):
        """Handle a previous msgid_plural line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgid_plural = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pm(self):
        """Handle a previous msgid line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgid = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pc(self):
        """Handle a previous msgctxt line."""
        self._start_entry_if_needed()
        self.current_entry.previous_msgctxt = \
            unescape(self.current_token[1:-1])
        return True

    def handle_ct(self):
        """Handle a msgctxt."""
        self._start_entry_if_needed()
        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_mi(self):
        """Handle a msgid."""
        self._start_entry_if_needed()
        self.current_entry.obsolete = self.entry_obsolete
        self.current_entry.msgid = unescape(self.current_token[1:-1])
        return True

    def handle_mp(self):
        """Handle a msgid plural."""
        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_ms(self):
        """Handle a msgstr."""
        self.current_entry.msgstr = unescape(self.current_token[1:-1])
        return True

    def handle_mx(self):
        """Handle a msgstr plural."""
        index = self._plural_index(self.current_token)
        # the value is what lies between the first quote and the last char
        value = self.current_token[self.current_token.find('"') + 1:-1]
        self.current_entry.msgstr_plural[index] = unescape(value)
        # remembered so that continuation lines extend the right form
        self.msgstr_index = index
        return True

    def handle_mc(self):
        """Handle a msgid or msgstr continuation line."""
        token = unescape(self.current_token[1:-1])
        if self.current_state == 'ct':
            self.current_entry.msgctxt += token
        elif self.current_state == 'mi':
            self.current_entry.msgid += token
        elif self.current_state == 'mp':
            self.current_entry.msgid_plural += token
        elif self.current_state == 'ms':
            self.current_entry.msgstr += token
        elif self.current_state == 'mx':
            self.current_entry.msgstr_plural[self.msgstr_index] += token
        elif self.current_state == 'pp':
            self.current_entry.previous_msgid_plural += token
        elif self.current_state == 'pm':
            self.current_entry.previous_msgid += token
        elif self.current_state == 'pc':
            self.current_entry.previous_msgctxt += token
        # don't change the current state
        return False
1676 # }}}
1677 # class _MOFileParser {{{
1678
1679
class _MOFileParser(object):
    """
    A class to parse binary mo files.

    The parser reads the header, the two (length, offset) tables and then
    every msgid/msgstr pair from the file handle, and fills the instance
    created in the constructor with the decoded entries.
    """

    # pylint: disable=unused-argument,redefined-outer-name
    def __init__(self, mofile, *_args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``mofile``
            string, path to the mo file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class used to build the resulting instance, defaults to
            ``MOFile`` (optional).
        """
        if _is_file(mofile):
            self.fhandle = open(mofile, 'rb')
        else:
            self.fhandle = io.BytesIO(mofile)

        klass = kwargs.get('klass')
        if klass is None:
            klass = MOFile
        self.instance = klass(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )

    def __del__(self):
        """
        Make sure the file is closed, this prevents warnings on unclosed file
        when running tests with python >= 3.2.
        """
        # ``fhandle`` does not exist when the constructor failed before
        # assigning it (e.g. ``open`` raised), yet __del__ still runs.
        fhandle = getattr(self, 'fhandle', None)
        if fhandle and hasattr(fhandle, 'close'):
            fhandle.close()

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Returns the populated instance.  Raises ``IOError`` when the magic
        number or the major revision number is not an expected one.  The
        file handle is closed when this method exits, whether parsing
        succeeded or not.
        """
        try:
            # parse magic number, it also tells the byte order of the file
            magic_number = self._readbinary('<I', 4)
            if magic_number == MOFile.MAGIC:
                ii = '<II'
            elif magic_number == MOFile.MAGIC_SWAPPED:
                ii = '>II'
            else:
                raise IOError('Invalid mo file, magic number is incorrect !')
            self.instance.magic_number = magic_number
            # parse the version number and the number of strings
            version, numofstrings = self._readbinary(ii, 8)
            # from MO file format specs: "A program seeing an unexpected
            # major revision number should stop reading the MO file entirely"
            if version >> 16 not in (0, 1):
                raise IOError(
                    'Invalid mo file, unexpected major revision number'
                )
            self.instance.version = version
            # offsets of the original strings and translation strings tables
            msgids_table_offset, msgstrs_table_offset = \
                self._readbinary(ii, 8)
            # each table holds one (length, offset) pair per string
            msgids_index = self._read_index(
                ii, msgids_table_offset, numofstrings)
            msgstrs_index = self._read_index(
                ii, msgstrs_table_offset, numofstrings)
            # build entries
            for i in range(numofstrings):
                msgid = self._read_string(*msgids_index[i])
                msgstr = self._read_string(*msgstrs_index[i])
                if i == 0 and not msgid:  # metadata
                    self.instance.metadata = self._parse_metadata(msgstr)
                    continue
                # test if we have a plural entry: singular and plural
                # msgids (and the plural msgstrs) are NUL separated
                msgid_tokens = msgid.split(b('\0'))
                if len(msgid_tokens) > 1:
                    entry = self._build_entry(
                        msgid=msgid_tokens[0],
                        msgid_plural=msgid_tokens[1],
                        msgstr_plural=dict(enumerate(msgstr.split(b('\0'))))
                    )
                else:
                    entry = self._build_entry(msgid=msgid, msgstr=msgstr)
                self.instance.append(entry)
        finally:
            # close opened file, even when parsing failed
            self.fhandle.close()
        return self.instance

    def _read_index(self, fmt, offset, count):
        """
        Private method that reads ``count`` pairs of integers, unpacked
        with format ``fmt``, starting at ``offset`` in the file.
        """
        self.fhandle.seek(offset)
        return [self._readbinary(fmt, 8) for _ in range(count)]

    def _read_string(self, length, offset):
        """
        Private method that returns the ``length`` bytes found at
        ``offset`` in the file.
        """
        self.fhandle.seek(offset)
        return self.fhandle.read(length)

    def _parse_metadata(self, raw):
        """
        Private method that turns the raw bytes of the metadata entry into
        a dict of decoded ``key: value`` pairs.

        Lines are split on the first colon, lines with an empty key are
        skipped and a line without colon yields an empty value.
        """
        encoding = self.instance.encoding
        metadata = {}
        for line in raw.split(b('\n')):
            tokens = line.split(b(':'), 1)
            if tokens[0] == b(''):
                continue
            key = tokens[0].decode(encoding)
            if len(tokens) > 1:
                metadata[key] = tokens[1].decode(encoding).strip()
            else:
                metadata[key] = u('')
        return metadata

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
                     msgstr_plural=None):
        """
        Private method that builds a ``MOEntry`` from raw bytes.

        ``msgid`` may hold a context and a message id separated by a
        ``\\x04`` byte.  Every value is decoded with the encoding of the
        instance; ``msgstr_plural`` is a dict mapping plural indexes to
        bytes and is left untouched.
        """
        encoding = self.instance.encoding
        # split once only: bytes following a second separator belong to
        # the msgid and must not be dropped
        msgctxt_msgid = msgid.split(b('\x04'), 1)
        if len(msgctxt_msgid) > 1:
            kwargs = {
                'msgctxt': msgctxt_msgid[0].decode(encoding),
                'msgid': msgctxt_msgid[1].decode(encoding),
            }
        else:
            kwargs = {'msgid': msgid.decode(encoding)}
        if msgstr:
            kwargs['msgstr'] = msgstr.decode(encoding)
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
        if msgstr_plural:
            # decode into a new dict instead of mutating the argument
            kwargs['msgstr_plural'] = {
                index: value.decode(encoding)
                for index, value in msgstr_plural.items()
            }
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpacks ``numbytes`` bytes of data read from
        the file handle using format ``fmt``.
        It returns a tuple, or the value itself if the tuple length is 1.
        """
        content = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, content)
        if len(tup) == 1:
            return tup[0]
        return tup
1826 # }}}