cola/polib.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # License: MIT (see LICENSE file provided)
   4 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
   5
   6 """
   7 **polib** allows you to manipulate, create, modify gettext files (pot, po and
   8 mo files).  You can load existing files, iterate through its entries, add,
   9 modify entries, comments or metadata, etc. or create new po files from scratch.
  10
  11 **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and
  12 :func:`~polib.mofile` convenience functions.
  13 """
  14
  15 import array
  16 import codecs
  17 import os
  18 import re
  19 import struct
  20 import sys
  21 import textwrap
  22 import io
  23
  24
# module metadata
__author__ = 'David Jean Louis <izimobil@gmail.com>'
__version__ = '1.1.1'
# names exported by ``from polib import *``
__all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
           'default_encoding', 'escape', 'unescape', 'detect_encoding', ]


# the default encoding to use when encoding cannot be detected; it is also
# the fallback for the ``encoding`` keyword of the file and entry classes
default_encoding = 'utf-8'
  33
  34 # python 2/3 compatibility helpers {{{
  35
  36
  37 if sys.version_info < (3,):
  38     PY3 = False
  39     text_type = unicode
  40
  41     def b(s):
  42         return s
  43
  44     def u(s):
  45         return unicode(s, "unicode_escape")
  46
  47 else:
  48     PY3 = True
  49     text_type = str
  50
  51     def b(s):
  52         return s.encode("latin-1")
  53
  54     def u(s):
  55         return s
  56 # }}}
  57 # _pofile_or_mofile {{{
  58
  59
  60 def _pofile_or_mofile(f, type, **kwargs):
  61     """
  62     Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
  63     honor the DRY concept.
  64     """
  65     # get the file encoding
  66     enc = kwargs.get('encoding')
  67     if enc is None:
  68         enc = detect_encoding(f, type == 'mofile')
  69
  70     # parse the file
  71     kls = type == 'pofile' and _POFileParser or _MOFileParser
  72     parser = kls(
  73         f,
  74         encoding=enc,
  75         check_for_duplicates=kwargs.get('check_for_duplicates', False),
  76         klass=kwargs.get('klass')
  77     )
  78     instance = parser.parse()
  79     instance.wrapwidth = kwargs.get('wrapwidth', 78)
  80     return instance
  81 # }}}
  82 # _is_file {{{
  83
  84
  85 def _is_file(filename_or_contents):
  86     """
  87     Safely returns the value of os.path.exists(filename_or_contents).
  88
  89     Arguments:
  90
  91     ``filename_or_contents``
  92         either a filename, or a string holding the contents of some file.
  93         In the latter case, this function will always return False.
  94     """
  95     try:
  96         return os.path.isfile(filename_or_contents)
  97     except (TypeError, ValueError, UnicodeEncodeError):
  98         return False
  99 # }}}
 100 # function pofile() {{{
 101
 102
def pofile(pofile, **kwargs):
    """
    Convenience function that parses the po or pot file ``pofile`` and returns
    a :class:`~polib.POFile` instance.

    Arguments:

    ``pofile``
        string, full or relative path to the po/pot file or its content (data).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.POFile`
        instance).
    """
    # all the work is delegated to the helper shared with mofile()
    return _pofile_or_mofile(pofile, 'pofile', **kwargs)
 131 # }}}
 132 # function mofile() {{{
 133
 134
def mofile(mofile, **kwargs):
    """
    Convenience function that parses the mo file ``mofile`` and returns a
    :class:`~polib.MOFile` instance.

    Arguments:

    ``mofile``
        string, full or relative path to the mo file or its content (string
        or bytes).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext to generate the po file that was used to format the mo file
        (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).

    ``klass``
        class which is used to instantiate the return value (optional,
        default: ``None``, the return value will be a :class:`~polib.MOFile`
        instance).
    """
    # all the work is delegated to the helper shared with pofile()
    return _pofile_or_mofile(mofile, 'mofile', **kwargs)
 165 # }}}
 166 # function detect_encoding() {{{
 167
 168
 169 def detect_encoding(file, binary_mode=False):
 170     """
 171     Try to detect the encoding used by the ``file``. The ``file`` argument can
 172     be a PO or MO file path or a string containing the contents of the file.
 173     If the encoding cannot be detected, the function will return the value of
 174     ``default_encoding``.
 175
 176     Arguments:
 177
 178     ``file``
 179         string, full or relative path to the po/mo file or its content.
 180
 181     ``binary_mode``
 182         boolean, set this to True if ``file`` is a mo file.
 183     """
 184     PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
 185     rxt = re.compile(u(PATTERN))
 186     rxb = re.compile(b(PATTERN))
 187
 188     def charset_exists(charset):
 189         """Check whether ``charset`` is valid or not."""
 190         try:
 191             codecs.lookup(charset)
 192         except LookupError:
 193             return False
 194         return True
 195
 196     if not _is_file(file):
 197         try:
 198             match = rxt.search(file)
 199         except TypeError:
 200             match = rxb.search(file)
 201         if match:
 202             enc = match.group(1).strip()
 203             if not isinstance(enc, text_type):
 204                 enc = enc.decode('utf-8')
 205             if charset_exists(enc):
 206                 return enc
 207     else:
 208         # For PY3, always treat as binary
 209         if binary_mode or PY3:
 210             mode = 'rb'
 211             rx = rxb
 212         else:
 213             mode = 'r'
 214             rx = rxt
 215         f = open(file, mode)
 216         for line in f.readlines():
 217             match = rx.search(line)
 218             if match:
 219                 f.close()
 220                 enc = match.group(1).strip()
 221                 if not isinstance(enc, text_type):
 222                     enc = enc.decode('utf-8')
 223                 if charset_exists(enc):
 224                     return enc
 225         f.close()
 226     return default_encoding
 227 # }}}
 228 # function escape() {{{
 229
 230
 231 def escape(st):
 232     """
 233     Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 234     the given string ``st`` and returns it.
 235     """
 236     return st.replace('\\', r'\\')\
 237              .replace('\t', r'\t')\
 238              .replace('\r', r'\r')\
 239              .replace('\n', r'\n')\
 240              .replace('\"', r'\"')
 241 # }}}
 242 # function unescape() {{{
 243
 244
 245 def unescape(st):
 246     """
 247     Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
 248     the given string ``st`` and returns it.
 249     """
 250     def unescape_repl(m):
 251         m = m.group(1)
 252         if m == 'n':
 253             return '\n'
 254         if m == 't':
 255             return '\t'
 256         if m == 'r':
 257             return '\r'
 258         if m == '\\':
 259             return '\\'
 260         return m  # handles escaped double quote
 261     return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
 262 # }}}
 263 # function natural_sort() {{{
 264
 265
 266 def natural_sort(lst):
 267     """
 268     Sort naturally the given list.
 269     Credits: http://stackoverflow.com/a/4836734
 270     """
 271     def convert(text):
 272         return int(text) if text.isdigit() else text.lower()
 273
 274     def alphanum_key(key):
 275         return [convert(c) for c in re.split('([0-9]+)', key)]
 276
 277     return sorted(lst, key=alphanum_key)
 278
 279 # }}}
 280 # class _BaseFile {{{
 281
 282
 283 class _BaseFile(list):
 284     """
 285     Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
 286     classes. This class should **not** be instantiated directly.
 287     """
 288
 289     def __init__(self, *args, **kwargs):
 290         """
 291         Constructor, accepts the following keyword arguments:
 292
 293         ``pofile``
 294             string, the path to the po or mo file, or its content as a string.
 295
 296         ``wrapwidth``
 297             integer, the wrap width, only useful when the ``-w`` option was
 298             passed to xgettext (optional, default: ``78``).
 299
 300         ``encoding``
 301             string, the encoding to use, defaults to ``default_encoding``
 302             global variable (optional).
 303
 304         ``check_for_duplicates``
 305             whether to check for duplicate entries when adding entries to the
 306             file, (optional, default: ``False``).
 307         """
 308         list.__init__(self)
 309         # the opened file handle
 310         pofile = kwargs.get('pofile', None)
 311         if pofile and _is_file(pofile):
 312             self.fpath = pofile
 313         else:
 314             self.fpath = kwargs.get('fpath')
 315         # the width at which lines should be wrapped
 316         self.wrapwidth = kwargs.get('wrapwidth', 78)
 317         # the file encoding
 318         self.encoding = kwargs.get('encoding', default_encoding)
 319         # whether to check for duplicate entries or not
 320         self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
 321         # header
 322         self.header = ''
 323         # both po and mo files have metadata
 324         self.metadata = {}
 325         self.metadata_is_fuzzy = 0
 326
 327     def __unicode__(self):
 328         """
 329         Returns the unicode representation of the file.
 330         """
 331         ret = []
 332         entries = [self.metadata_as_entry()] + \
 333                   [e for e in self if not e.obsolete]
 334         for entry in entries:
 335             ret.append(entry.__unicode__(self.wrapwidth))
 336         for entry in self.obsolete_entries():
 337             ret.append(entry.__unicode__(self.wrapwidth))
 338         ret = u('\n').join(ret)
 339         return ret
 340
 341     if PY3:
 342         def __str__(self):
 343             return self.__unicode__()
 344     else:
 345         def __str__(self):
 346             """
 347             Returns the string representation of the file.
 348             """
 349             return unicode(self).encode(self.encoding)
 350
 351     def __contains__(self, entry):
 352         """
 353         Overridden ``list`` method to implement the membership test (in and
 354         not in).
 355         The method considers that an entry is in the file if it finds an entry
 356         that has the same msgid (the test is **case sensitive**) and the same
 357         msgctxt (or none for both entries).
 358
 359         Argument:
 360
 361         ``entry``
 362             an instance of :class:`~polib._BaseEntry`.
 363         """
 364         return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \
 365             is not None
 366
 367     def __eq__(self, other):
 368         return str(self) == str(other)
 369
 370     def append(self, entry):
 371         """
 372         Overridden method to check for duplicates entries, if a user tries to
 373         add an entry that is already in the file, the method will raise a
 374         ``ValueError`` exception.
 375
 376         Argument:
 377
 378         ``entry``
 379             an instance of :class:`~polib._BaseEntry`.
 380         """
 381         # check_for_duplicates may not be defined (yet) when unpickling.
 382         # But if pickling, we never want to check for duplicates anyway.
 383         if getattr(self, 'check_for_duplicates', False) and entry in self:
 384             raise ValueError('Entry "%s" already exists' % entry.msgid)
 385         super(_BaseFile, self).append(entry)
 386
 387     def insert(self, index, entry):
 388         """
 389         Overridden method to check for duplicates entries, if a user tries to
 390         add an entry that is already in the file, the method will raise a
 391         ``ValueError`` exception.
 392
 393         Arguments:
 394
 395         ``index``
 396             index at which the entry should be inserted.
 397
 398         ``entry``
 399             an instance of :class:`~polib._BaseEntry`.
 400         """
 401         if self.check_for_duplicates and entry in self:
 402             raise ValueError('Entry "%s" already exists' % entry.msgid)
 403         super(_BaseFile, self).insert(index, entry)
 404
 405     def metadata_as_entry(self):
 406         """
 407         Returns the file metadata as a :class:`~polib.POFile` instance.
 408         """
 409         e = POEntry(msgid='')
 410         mdata = self.ordered_metadata()
 411         if mdata:
 412             strs = []
 413             for name, value in mdata:
 414                 # Strip whitespace off each line in a multi-line entry
 415                 strs.append('%s: %s' % (name, value))
 416             e.msgstr = '\n'.join(strs) + '\n'
 417         if self.metadata_is_fuzzy:
 418             e.flags.append('fuzzy')
 419         return e
 420
 421     def save(self, fpath=None, repr_method='__unicode__', newline=None):
 422         """
 423         Saves the po file to ``fpath``.
 424         If it is an existing file and no ``fpath`` is provided, then the
 425         existing file is rewritten with the modified data.
 426
 427         Keyword arguments:
 428
 429         ``fpath``
 430             string, full or relative path to the file.
 431
 432         ``repr_method``
 433             string, the method to use for output.
 434
 435         ``newline``
 436             string, controls how universal newlines works
 437         """
 438         if self.fpath is None and fpath is None:
 439             raise IOError('You must provide a file path to save() method')
 440         contents = getattr(self, repr_method)()
 441         if fpath is None:
 442             fpath = self.fpath
 443         if repr_method == 'to_binary':
 444             fhandle = open(fpath, 'wb')
 445         else:
 446             fhandle = io.open(
 447                 fpath,
 448                 'w',
 449                 encoding=self.encoding,
 450                 newline=newline
 451             )
 452             if not isinstance(contents, text_type):
 453                 contents = contents.decode(self.encoding)
 454         fhandle.write(contents)
 455         fhandle.close()
 456         # set the file path if not set
 457         if self.fpath is None and fpath:
 458             self.fpath = fpath
 459
 460     def find(self, st, by='msgid', include_obsolete_entries=False,
 461              msgctxt=False):
 462         """
 463         Find the entry which msgid (or property identified by the ``by``
 464         argument) matches the string ``st``.
 465
 466         Keyword arguments:
 467
 468         ``st``
 469             string, the string to search for.
 470
 471         ``by``
 472             string, the property to use for comparison (default: ``msgid``).
 473
 474         ``include_obsolete_entries``
 475             boolean, whether to also search in entries that are obsolete.
 476
 477         ``msgctxt``
 478             string, allows specifying a specific message context for the
 479             search.
 480         """
 481         if include_obsolete_entries:
 482             entries = self[:]
 483         else:
 484             entries = [e for e in self if not e.obsolete]
 485         matches = []
 486         for e in entries:
 487             if getattr(e, by) == st:
 488                 if msgctxt is not False and e.msgctxt != msgctxt:
 489                     continue
 490                 matches.append(e)
 491         if len(matches) == 1:
 492             return matches[0]
 493         elif len(matches) > 1:
 494             if not msgctxt:
 495                 # find the entry with no msgctx
 496                 e = None
 497                 for m in matches:
 498                     if not m.msgctxt:
 499                         e = m
 500                 if e:
 501                     return e
 502                 # fallback to the first entry found
 503                 return matches[0]
 504         return None
 505
 506     def ordered_metadata(self):
 507         """
 508         Convenience method that returns an ordered version of the metadata
 509         dictionary. The return value is list of tuples (metadata name,
 510         metadata_value).
 511         """
 512         # copy the dict first
 513         metadata = self.metadata.copy()
 514         data_order = [
 515             'Project-Id-Version',
 516             'Report-Msgid-Bugs-To',
 517             'POT-Creation-Date',
 518             'PO-Revision-Date',
 519             'Last-Translator',
 520             'Language-Team',
 521             'Language',
 522             'MIME-Version',
 523             'Content-Type',
 524             'Content-Transfer-Encoding',
 525             'Plural-Forms'
 526         ]
 527         ordered_data = []
 528         for data in data_order:
 529             try:
 530                 value = metadata.pop(data)
 531                 ordered_data.append((data, value))
 532             except KeyError:
 533                 pass
 534         # the rest of the metadata will be alphabetically ordered since there
 535         # are no specs for this AFAIK
 536         for data in natural_sort(metadata.keys()):
 537             value = metadata[data]
 538             ordered_data.append((data, value))
 539         return ordered_data
 540
 541     def to_binary(self):
 542         """
 543         Return the binary representation of the file.
 544         """
 545         offsets = []
 546         entries = self.translated_entries()
 547
 548         # the keys are sorted in the .mo file
 549         def cmp(_self, other):
 550             # msgfmt compares entries with msgctxt if it exists
 551             self_msgid = _self.msgctxt and _self.msgctxt or _self.msgid
 552             other_msgid = other.msgctxt and other.msgctxt or other.msgid
 553             if self_msgid > other_msgid:
 554                 return 1
 555             elif self_msgid < other_msgid:
 556                 return -1
 557             else:
 558                 return 0
 559         # add metadata entry
 560         entries.sort(key=lambda o: o.msgid_with_context.encode('utf-8'))
 561         mentry = self.metadata_as_entry()
 562         entries = [mentry] + entries
 563         entries_len = len(entries)
 564         ids, strs = b(''), b('')
 565         for e in entries:
 566             # For each string, we need size and file offset.  Each string is
 567             # NUL terminated; the NUL does not count into the size.
 568             msgid = b('')
 569             if e.msgctxt:
 570                 # Contexts are stored by storing the concatenation of the
 571                 # context, a <EOT> byte, and the original string
 572                 msgid = self._encode(e.msgctxt + '\4')
 573             if e.msgid_plural:
 574                 msgstr = []
 575                 for index in sorted(e.msgstr_plural.keys()):
 576                     msgstr.append(e.msgstr_plural[index])
 577                 msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
 578                 msgstr = self._encode('\0'.join(msgstr))
 579             else:
 580                 msgid += self._encode(e.msgid)
 581                 msgstr = self._encode(e.msgstr)
 582             offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
 583             ids += msgid + b('\0')
 584             strs += msgstr + b('\0')
 585
 586         # The header is 7 32-bit unsigned integers.
 587         keystart = 7 * 4 + 16 * entries_len
 588         # and the values start after the keys
 589         valuestart = keystart + len(ids)
 590         koffsets = []
 591         voffsets = []
 592         # The string table first has the list of keys, then the list of values.
 593         # Each entry has first the size of the string, then the file offset.
 594         for o1, l1, o2, l2 in offsets:
 595             koffsets += [l1, o1 + keystart]
 596             voffsets += [l2, o2 + valuestart]
 597         offsets = koffsets + voffsets
 598
 599         output = struct.pack(
 600             "Iiiiiii",
 601             # Magic number
 602             MOFile.MAGIC,
 603             # Version
 604             0,
 605             # number of entries
 606             entries_len,
 607             # start of key index
 608             7 * 4,
 609             # start of value index
 610             7 * 4 + entries_len * 8,
 611             # size and offset of hash table, we don't use hash tables
 612             0, keystart
 613
 614         )
 615         if PY3 and sys.version_info.minor > 1:  # python 3.2 or superior
 616             output += array.array("i", offsets).tobytes()
 617         else:
 618             output += array.array("i", offsets).tostring()
 619         output += ids
 620         output += strs
 621         return output
 622
 623     def _encode(self, mixed):
 624         """
 625         Encodes the given ``mixed`` argument with the file encoding if and
 626         only if it's an unicode string and returns the encoded string.
 627         """
 628         if isinstance(mixed, text_type):
 629             mixed = mixed.encode(self.encoding)
 630         return mixed
 631 # }}}
 632 # class POFile {{{
 633
 634
 635 class POFile(_BaseFile):
 636     """
 637     Po (or Pot) file reader/writer.
 638     This class inherits the :class:`~polib._BaseFile` class and, by extension,
 639     the python ``list`` type.
 640     """
 641
 642     def __unicode__(self):
 643         """
 644         Returns the unicode representation of the po file.
 645         """
 646         ret, headers = '', self.header.split('\n')
 647         for header in headers:
 648             if not len(header):
 649                 ret += "#\n"
 650             elif header[:1] in [',', ':']:
 651                 ret += '#%s\n' % header
 652             else:
 653                 ret += '# %s\n' % header
 654
 655         if not isinstance(ret, text_type):
 656             ret = ret.decode(self.encoding)
 657
 658         return ret + _BaseFile.__unicode__(self)
 659
 660     def save_as_mofile(self, fpath):
 661         """
 662         Saves the binary representation of the file to given ``fpath``.
 663
 664         Keyword argument:
 665
 666         ``fpath``
 667             string, full or relative path to the mo file.
 668         """
 669         _BaseFile.save(self, fpath, 'to_binary')
 670
 671     def percent_translated(self):
 672         """
 673         Convenience method that returns the percentage of translated
 674         messages.
 675         """
 676         total = len([e for e in self if not e.obsolete])
 677         if total == 0:
 678             return 100
 679         translated = len(self.translated_entries())
 680         return int(translated * 100 / float(total))
 681
 682     def translated_entries(self):
 683         """
 684         Convenience method that returns the list of translated entries.
 685         """
 686         return [e for e in self if e.translated()]
 687
 688     def untranslated_entries(self):
 689         """
 690         Convenience method that returns the list of untranslated entries.
 691         """
 692         return [e for e in self if not e.translated() and not e.obsolete
 693                 and not e.fuzzy]
 694
 695     def fuzzy_entries(self):
 696         """
 697         Convenience method that returns the list of fuzzy entries.
 698         """
 699         return [e for e in self if e.fuzzy and not e.obsolete]
 700
 701     def obsolete_entries(self):
 702         """
 703         Convenience method that returns the list of obsolete entries.
 704         """
 705         return [e for e in self if e.obsolete]
 706
 707     def merge(self, refpot):
 708         """
 709         Convenience method that merges the current pofile with the pot file
 710         provided. It behaves exactly as the gettext msgmerge utility:
 711
 712         * comments of this file will be preserved, but extracted comments and
 713           occurrences will be discarded;
 714         * any translations or comments in the file will be discarded, however,
 715           dot comments and file positions will be preserved;
 716         * the fuzzy flags are preserved.
 717
 718         Keyword argument:
 719
 720         ``refpot``
 721             object POFile, the reference catalog.
 722         """
 723         # Store entries in dict/set for faster access
 724         self_entries = dict(
 725             (entry.msgid_with_context, entry) for entry in self
 726         )
 727         refpot_msgids = set(entry.msgid_with_context for entry in refpot)
 728         # Merge entries that are in the refpot
 729         for entry in refpot:
 730             e = self_entries.get(entry.msgid_with_context)
 731             if e is None:
 732                 e = POEntry()
 733                 self.append(e)
 734             e.merge(entry)
 735         # ok, now we must "obsolete" entries that are not in the refpot anymore
 736         for entry in self:
 737             if entry.msgid_with_context not in refpot_msgids:
 738                 entry.obsolete = True
 739 # }}}
 740 # class MOFile {{{
 741
 742
class MOFile(_BaseFile):
    """
    Mo file reader/writer.
    This class inherits the :class:`~polib._BaseFile` class and, by
    extension, the python ``list`` type.
    """
    # magic number written at the start of the binary representation
    MAGIC = 0x950412de
    # the same four bytes in the reverse order
    MAGIC_SWAPPED = 0xde120495

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts all keywords arguments accepted by
        :class:`~polib._BaseFile` class.
        """
        _BaseFile.__init__(self, *args, **kwargs)
        self.magic_number = None
        self.version = 0

    def save_as_pofile(self, fpath):
        """
        Saves the mofile as a pofile to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        # the default output method of the base class writes the text form
        _BaseFile.save(self, fpath)

    def save(self, fpath=None):
        """
        Saves the mofile to ``fpath``, in its binary form.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    def percent_translated(self):
        """
        Convenience method to keep the same interface with POFile instances,
        always returns ``100``.
        """
        return 100

    def translated_entries(self):
        """
        Convenience method to keep the same interface with POFile instances,
        returns the file object itself (not a copy).
        """
        return self

    def untranslated_entries(self):
        """
        Convenience method to keep the same interface with POFile instances,
        always returns an empty list.
        """
        return []

    def fuzzy_entries(self):
        """
        Convenience method to keep the same interface with POFile instances,
        always returns an empty list.
        """
        return []

    def obsolete_entries(self):
        """
        Convenience method to keep the same interface with POFile instances,
        always returns an empty list.
        """
        return []
 812 # }}}
 813 # class _BaseEntry {{{
 814
 815
 816 class _BaseEntry(object):
 817     """
 818     Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
 819     This class should **not** be instantiated directly.
 820     """
 821
 822     def __init__(self, *args, **kwargs):
 823         """
 824         Constructor, accepts the following keyword arguments:
 825
 826         ``msgid``
 827             string, the entry msgid.
 828
 829         ``msgstr``
 830             string, the entry msgstr.
 831
 832         ``msgid_plural``
 833             string, the entry msgid_plural.
 834
 835         ``msgstr_plural``
 836             dict, the entry msgstr_plural lines.
 837
 838         ``msgctxt``
 839             string, the entry context (msgctxt).
 840
 841         ``obsolete``
 842             bool, whether the entry is "obsolete" or not.
 843
 844         ``encoding``
 845             string, the encoding to use, defaults to ``default_encoding``
 846             global variable (optional).
 847         """
 848         self.msgid = kwargs.get('msgid', '')
 849         self.msgstr = kwargs.get('msgstr', '')
 850         self.msgid_plural = kwargs.get('msgid_plural', '')
 851         self.msgstr_plural = kwargs.get('msgstr_plural', {})
 852         self.msgctxt = kwargs.get('msgctxt', None)
 853         self.obsolete = kwargs.get('obsolete', False)
 854         self.encoding = kwargs.get('encoding', default_encoding)
 855
 856     def __unicode__(self, wrapwidth=78):
 857         """
 858         Returns the unicode representation of the entry.
 859         """
 860         if self.obsolete:
 861             delflag = '#~ '
 862         else:
 863             delflag = ''
 864         ret = []
 865         # write the msgctxt if any
 866         if self.msgctxt is not None:
 867             ret += self._str_field("msgctxt", delflag, "", self.msgctxt,
 868                                    wrapwidth)
 869         # write the msgid
 870         ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
 871         # write the msgid_plural if any
 872         if self.msgid_plural:
 873             ret += self._str_field("msgid_plural", delflag, "",
 874                                    self.msgid_plural, wrapwidth)
 875         if self.msgstr_plural:
 876             # write the msgstr_plural if any
 877             msgstrs = self.msgstr_plural
 878             keys = list(msgstrs)
 879             keys.sort()
 880             for index in keys:
 881                 msgstr = msgstrs[index]
 882                 plural_index = '[%s]' % index
 883                 ret += self._str_field("msgstr", delflag, plural_index, msgstr,
 884                                        wrapwidth)
 885         else:
 886             # otherwise write the msgstr
 887             ret += self._str_field("msgstr", delflag, "", self.msgstr,
 888                                    wrapwidth)
 889         ret.append('')
 890         ret = u('\n').join(ret)
 891         return ret
 892
    # __str__ returns text on python 3 and an encoded byte string on python 2
    if PY3:
        def __str__(self):
            """
            Returns the string representation of the entry.
            """
            return self.__unicode__()
    else:
        def __str__(self):
            """
            Returns the string representation of the entry, encoded with the
            entry encoding.
            """
            return unicode(self).encode(self.encoding)
 902
 903     def __eq__(self, other):
 904         return str(self) == str(other)
 905
 906     def _str_field(self, fieldname, delflag, plural_index, field,
 907                    wrapwidth=78):
 908         lines = field.splitlines(True)
 909         if len(lines) > 1:
 910             lines = [''] + lines  # start with initial empty line
 911         else:
 912             escaped_field = escape(field)
 913             specialchars_count = 0
 914             for c in ['\\', '\n', '\r', '\t', '"']:
 915                 specialchars_count += field.count(c)
 916             # comparison must take into account fieldname length + one space
 917             # + 2 quotes (eg. msgid "<string>")
 918             flength = len(fieldname) + 3
 919             if plural_index:
 920                 flength += len(plural_index)
 921             real_wrapwidth = wrapwidth - flength + specialchars_count
 922             if wrapwidth > 0 and len(field) > real_wrapwidth:
 923                 # Wrap the line but take field name into account
 924                 lines = [''] + [unescape(item) for item in textwrap.wrap(
 925                     escaped_field,
 926                     wrapwidth - 2,  # 2 for quotes ""
 927                     drop_whitespace=False,
 928                     break_long_words=False
 929                 )]
 930             else:
 931                 lines = [field]
 932         if fieldname.startswith('previous_'):
 933             # quick and dirty trick to get the real field name
 934             fieldname = fieldname[9:]
 935
 936         ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index,
 937                                 escape(lines.pop(0)))]
 938         for line in lines:
 939             ret.append('%s"%s"' % (delflag, escape(line)))
 940         return ret
 941
 942     @property
 943     def msgid_with_context(self):
 944         if self.msgctxt:
 945             return '%s%s%s' % (self.msgctxt, "\x04", self.msgid)
 946         return self.msgid
 947 # }}}
 948 # class POEntry {{{
 949
 950
 951 class POEntry(_BaseEntry):
 952     """
 953     Represents a po file entry.
 954     """
 955
 956     def __init__(self, *args, **kwargs):
 957         """
 958         Constructor, accepts the following keyword arguments:
 959
 960         ``comment``
 961             string, the entry comment.
 962
 963         ``tcomment``
 964             string, the entry translator comment.
 965
 966         ``occurrences``
 967             list, the entry occurrences.
 968
 969         ``flags``
 970             list, the entry flags.
 971
 972         ``previous_msgctxt``
 973             string, the entry previous context.
 974
 975         ``previous_msgid``
 976             string, the entry previous msgid.
 977
 978         ``previous_msgid_plural``
 979             string, the entry previous msgid_plural.
 980
 981         ``linenum``
 982             integer, the line number of the entry
 983         """
 984         _BaseEntry.__init__(self, *args, **kwargs)
 985         self.comment = kwargs.get('comment', '')
 986         self.tcomment = kwargs.get('tcomment', '')
 987         self.occurrences = kwargs.get('occurrences', [])
 988         self.flags = kwargs.get('flags', [])
 989         self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
 990         self.previous_msgid = kwargs.get('previous_msgid', None)
 991         self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
 992         self.linenum = kwargs.get('linenum', None)
 993
 994     def __unicode__(self, wrapwidth=78):
 995         """
 996         Returns the unicode representation of the entry.
 997         """
 998         ret = []
 999         # comments first, if any (with text wrapping as xgettext does)
1000         if self.obsolete:
1001             comments = [('tcomment', '# ')]
1002         else:
1003             comments = [('comment', '#. '), ('tcomment', '# ')]
1004         for c in comments:
1005             val = getattr(self, c[0])
1006             if val:
1007                 for comment in val.split('\n'):
1008                     if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth:
1009                         ret += textwrap.wrap(
1010                             comment,
1011                             wrapwidth,
1012                             initial_indent=c[1],
1013                             subsequent_indent=c[1],
1014                             break_long_words=False
1015                         )
1016                     else:
1017                         ret.append('%s%s' % (c[1], comment))
1018
1019         # occurrences (with text wrapping as xgettext does)
1020         if not self.obsolete and self.occurrences:
1021             filelist = []
1022             for fpath, lineno in self.occurrences:
1023                 if lineno:
1024                     filelist.append('%s:%s' % (fpath, lineno))
1025                 else:
1026                     filelist.append(fpath)
1027             filestr = ' '.join(filelist)
1028             if wrapwidth > 0 and len(filestr) + 3 > wrapwidth:
1029                 # textwrap split words that contain hyphen, this is not
1030                 # what we want for filenames, so the dirty hack is to
1031                 # temporally replace hyphens with a char that a file cannot
1032                 # contain, like "*"
1033                 ret += [line.replace('*', '-') for line in textwrap.wrap(
1034                     filestr.replace('-', '*'),
1035                     wrapwidth,
1036                     initial_indent='#: ',
1037                     subsequent_indent='#: ',
1038                     break_long_words=False
1039                 )]
1040             else:
1041                 ret.append('#: ' + filestr)
1042
1043         # flags (TODO: wrapping ?)
1044         if self.flags:
1045             ret.append('#, %s' % ', '.join(self.flags))
1046
1047         # previous context and previous msgid/msgid_plural
1048         fields = ['previous_msgctxt', 'previous_msgid',
1049                   'previous_msgid_plural']
1050         if self.obsolete:
1051             prefix = "#~| "
1052         else:
1053             prefix = "#| "
1054         for f in fields:
1055             val = getattr(self, f)
1056             if val is not None:
1057                 ret += self._str_field(f, prefix, "", val, wrapwidth)
1058
1059         ret.append(_BaseEntry.__unicode__(self, wrapwidth))
1060         ret = u('\n').join(ret)
1061         return ret
1062
1063     def __cmp__(self, other):
1064         """
1065         Called by comparison operations if rich comparison is not defined.
1066         """
1067         # First: Obsolete test
1068         if self.obsolete != other.obsolete:
1069             if self.obsolete:
1070                 return -1
1071             else:
1072                 return 1
1073         # Work on a copy to protect original
1074         occ1 = sorted(self.occurrences[:])
1075         occ2 = sorted(other.occurrences[:])
1076         if occ1 > occ2:
1077             return 1
1078         if occ1 < occ2:
1079             return -1
1080         # Compare context
1081         msgctxt = self.msgctxt or '0'
1082         othermsgctxt = other.msgctxt or '0'
1083         if msgctxt > othermsgctxt:
1084             return 1
1085         elif msgctxt < othermsgctxt:
1086             return -1
1087         # Compare msgid_plural
1088         msgid_plural = self.msgid_plural or '0'
1089         othermsgid_plural = other.msgid_plural or '0'
1090         if msgid_plural > othermsgid_plural:
1091             return 1
1092         elif msgid_plural < othermsgid_plural:
1093             return -1
1094         # Compare msgstr_plural
1095         if self.msgstr_plural and isinstance(self.msgstr_plural, dict):
1096             msgstr_plural = list(self.msgstr_plural.values())
1097         else:
1098             msgstr_plural = []
1099         if other.msgstr_plural and isinstance(other.msgstr_plural, dict):
1100             othermsgstr_plural = list(other.msgstr_plural.values())
1101         else:
1102             othermsgstr_plural = []
1103         if msgstr_plural > othermsgstr_plural:
1104             return 1
1105         elif msgstr_plural < othermsgstr_plural:
1106             return -1
1107         # Compare msgid
1108         if self.msgid > other.msgid:
1109             return 1
1110         elif self.msgid < other.msgid:
1111             return -1
1112         # Compare msgstr
1113         if self.msgstr > other.msgstr:
1114             return 1
1115         elif self.msgstr < other.msgstr:
1116             return -1
1117         return 0
1118
1119     def __gt__(self, other):
1120         return self.__cmp__(other) > 0
1121
1122     def __lt__(self, other):
1123         return self.__cmp__(other) < 0
1124
1125     def __ge__(self, other):
1126         return self.__cmp__(other) >= 0
1127
1128     def __le__(self, other):
1129         return self.__cmp__(other) <= 0
1130
1131     def __eq__(self, other):
1132         return self.__cmp__(other) == 0
1133
1134     def __ne__(self, other):
1135         return self.__cmp__(other) != 0
1136
1137     def translated(self):
1138         """
1139         Returns ``True`` if the entry has been translated or ``False``
1140         otherwise.
1141         """
1142         if self.obsolete or self.fuzzy:
1143             return False
1144         if self.msgstr != '':
1145             return True
1146         if self.msgstr_plural:
1147             for pos in self.msgstr_plural:
1148                 if self.msgstr_plural[pos] == '':
1149                     return False
1150             return True
1151         return False
1152
1153     def merge(self, other):
1154         """
1155         Merge the current entry with the given pot entry.
1156         """
1157         self.msgid = other.msgid
1158         self.msgctxt = other.msgctxt
1159         self.occurrences = other.occurrences
1160         self.comment = other.comment
1161         fuzzy = self.fuzzy
1162         self.flags = other.flags[:]  # clone flags
1163         if fuzzy:
1164             self.flags.append('fuzzy')
1165         self.msgid_plural = other.msgid_plural
1166         self.obsolete = other.obsolete
1167         self.previous_msgctxt = other.previous_msgctxt
1168         self.previous_msgid = other.previous_msgid
1169         self.previous_msgid_plural = other.previous_msgid_plural
1170         if other.msgstr_plural:
1171             for pos in other.msgstr_plural:
1172                 try:
1173                     # keep existing translation at pos if any
1174                     self.msgstr_plural[pos]
1175                 except KeyError:
1176                     self.msgstr_plural[pos] = ''
1177
1178     @property
1179     def fuzzy(self):
1180         return 'fuzzy' in self.flags
1181
1182     def __hash__(self):
1183         return hash((self.msgid, self.msgstr))
1184 # }}}
1185 # class MOEntry {{{
1186
1187
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.
    """
    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments,
        for consistency with :class:`~polib.POEntry`:

        ``comment``
        ``tcomment``
        ``occurrences``
        ``flags``
        ``previous_msgctxt``
        ``previous_msgid``
        ``previous_msgid_plural``

        Note: even though these keyword arguments are accepted,
        they hold no real meaning in the context of MO files
        and are simply ignored.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        # the po-only attributes always get their empty value, whatever
        # was given in the keyword arguments
        self.comment = self.tcomment = ''
        self.occurrences, self.flags = [], []
        self.previous_msgctxt = None
        self.previous_msgid = self.previous_msgid_plural = None

    def __hash__(self):
        """Hash the entry on its msgid and msgstr."""
        key = (self.msgid, self.msgstr)
        return hash(key)
1220
1221 # }}}
1222 # class _POFileParser {{{
1223
1224
class _POFileParser(object):
    """
    A finite state machine to parse efficiently and correctly po
    file format.
    """

    def __init__(self, pofile, *args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``pofile``
            string, path to the po file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            the class instantiated to hold the parsed entries (optional,
            default: ``POFile``).
        """
        enc = kwargs.get('encoding', default_encoding)
        if _is_file(pofile):
            try:
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
            except LookupError:
                # unknown encoding name: fall back to the default encoding
                enc = default_encoding
                self.fhandle = io.open(pofile, 'rt', encoding=enc)
        else:
            # not a file: the argument is the content itself
            self.fhandle = pofile.splitlines()

        klass = kwargs.get('klass')
        if klass is None:
            klass = POFile
        self.instance = klass(
            pofile=pofile,
            encoding=enc,
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )
        self.transitions = {}
        self.current_line = 0
        self.current_entry = POEntry(linenum=self.current_line)
        self.current_state = 'st'
        self.current_token = None
        # two memo flags used in handlers
        self.msgstr_index = 0
        self.entry_obsolete = 0
        # Configure the state machine, by adding transitions.
        # Signification of symbols:
        #     * ST: Beginning of the file (start)
        #     * HE: Header
        #     * TC: a translation comment
        #     * GC: a generated comment
        #     * OC: a file/line occurrence
        #     * FL: a flags line
        #     * CT: a message context
        #     * PC: a previous msgctxt
        #     * PM: a previous msgid
        #     * PP: a previous msgid_plural
        #     * MI: a msgid
        #     * MP: a msgid plural
        #     * MS: a msgstr
        #     * MX: a msgstr plural
        #     * MC: a msgid or msgstr continuation line
        all_states = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp',
                      'tc', 'ms', 'mp', 'mx', 'mi']

        self.add('tc', ['st', 'he'],                                     'he')
        self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms',
                        'mp', 'mx', 'mi'],                               'tc')
        self.add('gc', all_states,                                       'gc')
        self.add('oc', all_states,                                       'oc')
        self.add('fl', all_states,                                       'fl')
        self.add('pc', all_states,                                       'pc')
        self.add('pm', all_states,                                       'pm')
        self.add('pp', all_states,                                       'pp')
        self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm',
                        'pp', 'ms', 'mx'],                               'ct')
        self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc',
                        'pm', 'pp', 'ms', 'mx'],                         'mi')
        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'],             'mp')
        self.add('ms', ['mi', 'mp', 'tc'],                               'ms')
        self.add('mx', ['mi', 'mx', 'mp', 'tc'],                         'mx')
        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')

    def parse(self):
        """
        Run the state machine, parse the file line by line and call process()
        with the current matched symbol.

        Returns the instance holding the parsed entries and metadata.
        Raises ``IOError`` on a syntax error.  An opened file is closed in
        every case, including when the parsing is aborted by an error.
        """
        try:
            self._parse_lines()
            self._extract_metadata()
        finally:
            # close opened file
            if not isinstance(self.fhandle, list):  # must be file
                self.fhandle.close()
        return self.instance

    def _syntax_error(self, detail=None):
        """Build the IOError reporting a syntax error on the current line."""
        fpath = '%s ' % self.instance.fpath if self.instance.fpath else ''
        message = 'Syntax error in po file %s(line %s)' % (fpath,
                                                           self.current_line)
        if detail:
            message += ': %s' % detail
        return IOError(message)

    def _parse_lines(self):
        """Feed every line of the input to the state machine."""
        keywords = {
            'msgctxt': 'ct',
            'msgid': 'mi',
            'msgstr': 'ms',
            'msgid_plural': 'mp',
        }
        prev_keywords = {
            'msgid_plural': 'pp',
            'msgid': 'pm',
            'msgctxt': 'pc',
        }
        tokens = []
        for line in self.fhandle:
            self.current_line += 1
            if self.current_line == 1:
                # drop the byte order mark at the start of the first line
                BOM = codecs.BOM_UTF8.decode('utf-8')
                if line.startswith(BOM):
                    line = line[len(BOM):]
            line = line.strip()
            if line == '':
                continue

            tokens = line.split(None, 2)
            nb_tokens = len(tokens)

            # "previous" lines of obsolete entries are skipped
            if tokens[0] == '#~|':
                continue

            if tokens[0] == '#~' and nb_tokens > 1:
                # obsolete entry: drop the marker and parse the rest of the
                # line as a regular line
                line = line[3:].strip()
                tokens = tokens[1:]
                nb_tokens -= 1
                self.entry_obsolete = 1
            else:
                self.entry_obsolete = 0

            # Take care of keywords like
            # msgid, msgid_plural, msgctxt & msgstr.
            if tokens[0] in keywords and nb_tokens > 1:
                line = line[len(tokens[0]):].lstrip()
                if re.search(r'([^\\]|^)"', line[1:-1]):
                    raise self._syntax_error('unescaped double quote found')
                self.current_token = line
                self.process(keywords[tokens[0]])
                continue

            self.current_token = line

            if tokens[0] == '#:':
                if nb_tokens <= 1:
                    continue
                # we are on a occurrences line
                self.process('oc')

            elif line[:1] == '"':
                # we are on a continuation line
                if re.search(r'([^\\]|^)"', line[1:-1]):
                    raise self._syntax_error('unescaped double quote found')
                self.process('mc')

            elif line[:7] == 'msgstr[':
                # we are on a msgstr plural
                self.process('mx')

            elif tokens[0] == '#,':
                if nb_tokens <= 1:
                    continue
                # we are on a flags line
                self.process('fl')

            elif tokens[0] == '#' or tokens[0].startswith('##'):
                # we are on a translator comment line
                self.process('tc')

            elif tokens[0] == '#.':
                if nb_tokens <= 1:
                    continue
                # we are on a generated comment line
                self.process('gc')

            elif tokens[0] == '#|':
                if nb_tokens <= 1:
                    raise self._syntax_error()

                # Remove the marker and any whitespace right after that.
                line = line[2:].lstrip()
                self.current_token = line

                if tokens[1].startswith('"'):
                    # Continuation of previous metadata.
                    self.process('mc')
                    continue

                if nb_tokens == 2:
                    # Invalid continuation line.
                    raise self._syntax_error('invalid continuation line')

                # we are on a "previous translation" comment line,
                if tokens[1] not in prev_keywords:
                    # Unknown keyword in previous translation comment.
                    raise self._syntax_error('unknown keyword %s' % tokens[1])

                # Remove the keyword and any whitespace
                # between it and the starting quote.
                line = line[len(tokens[1]):].lstrip()
                self.current_token = line
                self.process(prev_keywords[tokens[1]])

            else:
                raise self._syntax_error()

        if self.current_entry and len(tokens) > 0 and \
           not tokens[0].startswith('#'):
            # since entries are added when another entry is found, we must add
            # the last entry here (only if there are lines). Trailing comments
            # are ignored
            self.instance.append(self.current_entry)

    def _extract_metadata(self):
        """Move the entry with an empty msgid, if any, to the metadata."""
        metadataentry = self.instance.find('')
        if metadataentry:  # metadata found
            # remove the entry
            self.instance.remove(metadataentry)
            self.instance.metadata_is_fuzzy = metadataentry.flags
            key = None
            for msg in metadataentry.msgstr.splitlines():
                try:
                    key, val = msg.split(':', 1)
                    self.instance.metadata[key] = val.strip()
                except (ValueError, KeyError):
                    # a line without colon continues the previous value
                    if key is not None:
                        self.instance.metadata[key] += '\n' + msg.strip()

    def add(self, symbol, states, next_state):
        """
        Add a transition to the state machine.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``states``
            list, a list of states (two chars symbols).

        ``next_state``
            the next state the fsm will have after the action.
        """
        for state in states:
            action = getattr(self, 'handle_%s' % next_state)
            self.transitions[(symbol, state)] = (action, next_state)

    def process(self, symbol):
        """
        Process the transition corresponding to the current state and the
        symbol provided.

        Any exception raised while looking up or running the transition
        closes the file and is reported as an ``IOError`` syntax error on
        the current line.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).
        """
        try:
            (action, state) = self.transitions[(symbol, self.current_state)]
            # the state only changes when the handler returns a true value
            if action():
                self.current_state = state
        except Exception:
            if hasattr(self.fhandle, 'close'):
                self.fhandle.close()
            raise self._syntax_error()

    # state handlers

    def _store_complete_entry(self):
        """Store the current entry and start a new one after a msgstr."""
        if self.current_state in ['mc', 'ms', 'mx']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry(linenum=self.current_line)

    def handle_he(self):
        """Handle a header comment."""
        if self.instance.header != '':
            self.instance.header += '\n'
        self.instance.header += self.current_token[2:]
        return 1

    def handle_tc(self):
        """Handle a translator comment."""
        self._store_complete_entry()
        if self.current_entry.tcomment != '':
            self.current_entry.tcomment += '\n'
        tcomment = self.current_token.lstrip('#')
        if tcomment.startswith(' '):
            tcomment = tcomment[1:]
        self.current_entry.tcomment += tcomment
        return True

    def handle_gc(self):
        """Handle a generated comment."""
        self._store_complete_entry()
        if self.current_entry.comment != '':
            self.current_entry.comment += '\n'
        self.current_entry.comment += self.current_token[3:]
        return True

    def handle_oc(self):
        """Handle a file:num occurrence."""
        self._store_complete_entry()
        occurrences = self.current_token[3:].split()
        for occurrence in occurrences:
            if occurrence != '':
                try:
                    fil, line = occurrence.rsplit(':', 1)
                    if not line.isdigit():
                        # the colon is part of the file name
                        fil = occurrence
                        line = ''
                    self.current_entry.occurrences.append((fil, line))
                except (ValueError, AttributeError):
                    self.current_entry.occurrences.append((occurrence, ''))
        return True

    def handle_fl(self):
        """Handle a flags line."""
        self._store_complete_entry()
        self.current_entry.flags += [c.strip() for c in
                                     self.current_token[3:].split(',')]
        return True

    def handle_pp(self):
        """Handle a previous msgid_plural line."""
        self._store_complete_entry()
        self.current_entry.previous_msgid_plural = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pm(self):
        """Handle a previous msgid line."""
        self._store_complete_entry()
        self.current_entry.previous_msgid = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pc(self):
        """Handle a previous msgctxt line."""
        self._store_complete_entry()
        self.current_entry.previous_msgctxt = \
            unescape(self.current_token[1:-1])
        return True

    def handle_ct(self):
        """Handle a msgctxt."""
        self._store_complete_entry()
        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_mi(self):
        """Handle a msgid."""
        self._store_complete_entry()
        self.current_entry.obsolete = self.entry_obsolete
        self.current_entry.msgid = unescape(self.current_token[1:-1])
        return True

    def handle_mp(self):
        """Handle a msgid plural."""
        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_ms(self):
        """Handle a msgstr."""
        self.current_entry.msgstr = unescape(self.current_token[1:-1])
        return True

    def handle_mx(self):
        """Handle a msgstr plural."""
        # read every digit of the index, it may have more than one
        # (eg. msgstr[10])
        match = re.match(r'msgstr\[(\d+)', self.current_token, re.UNICODE)
        if match is None:
            # reported as a syntax error by process()
            raise ValueError('invalid msgstr plural index')
        index = int(match.group(1))
        value = self.current_token[self.current_token.find('"') + 1:-1]
        self.current_entry.msgstr_plural[index] = unescape(value)
        self.msgstr_index = index
        return True

    def handle_mc(self):
        """Handle a msgid or msgstr continuation line."""
        token = unescape(self.current_token[1:-1])
        if self.current_state == 'ct':
            self.current_entry.msgctxt += token
        elif self.current_state == 'mi':
            self.current_entry.msgid += token
        elif self.current_state == 'mp':
            self.current_entry.msgid_plural += token
        elif self.current_state == 'ms':
            self.current_entry.msgstr += token
        elif self.current_state == 'mx':
            self.current_entry.msgstr_plural[self.msgstr_index] += token
        elif self.current_state == 'pp':
            self.current_entry.previous_msgid_plural += token
        elif self.current_state == 'pm':
            self.current_entry.previous_msgid += token
        elif self.current_state == 'pc':
            self.current_entry.previous_msgctxt += token
        # don't change the current state
        return False
1655 # }}}
1656 # class _MOFileParser {{{
1657
1658
class _MOFileParser(object):
    """
    A class to parse binary mo files.

    The parser reads the header, the two (length, offset) tables describing
    the original and translated strings, and then builds one ``MOEntry`` per
    string pair, appending it to the instance created in the constructor.
    """

    def __init__(self, mofile, *args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``mofile``
            string, path to the mo file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).

        ``klass``
            class instantiated to hold the parsed data (optional, default:
            ``MOFile``).
        """
        # defined first so that ``__del__`` is safe even if opening fails
        self.fhandle = None
        if _is_file(mofile):
            self.fhandle = open(mofile, 'rb')
        else:
            self.fhandle = io.BytesIO(mofile)

        klass = kwargs.get('klass')
        if klass is None:
            klass = MOFile
        self.instance = klass(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )

    def __del__(self):
        """
        Make sure the file is closed, this prevents warnings on unclosed file
        when running tests with python >= 3.2.
        """
        # ``fhandle`` may be missing altogether when the object was only
        # partially constructed, hence the getattr()
        fhandle = getattr(self, 'fhandle', None)
        if fhandle is not None and hasattr(fhandle, 'close'):
            fhandle.close()

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Returns the populated instance.  Raises ``IOError`` when the magic
        number or the major revision number is not recognized.  The file
        handle is closed whether parsing succeeds or fails.
        """
        try:
            self._parse_file()
        finally:
            # close opened file, even when the file turned out to be invalid
            self.fhandle.close()
        return self.instance

    def _parse_file(self):
        """Read header, string tables and entries into ``self.instance``."""
        # parse magic number; it also tells which byte order to use for
        # every following integer
        magic_number = self._readbinary('<I', 4)
        if magic_number == MOFile.MAGIC:
            ii = '<II'
        elif magic_number == MOFile.MAGIC_SWAPPED:
            ii = '>II'
        else:
            raise IOError('Invalid mo file, magic number is incorrect !')
        self.instance.magic_number = magic_number
        # parse the version number and the number of strings
        version, numofstrings = self._readbinary(ii, 8)
        # from MO file format specs: "A program seeing an unexpected major
        # revision number should stop reading the MO file entirely"
        if version >> 16 not in (0, 1):
            raise IOError('Invalid mo file, unexpected major revision number')
        self.instance.version = version
        # offsets of the original strings and translation strings tables
        msgids_table_offset, msgstrs_table_offset = self._readbinary(ii, 8)
        msgids_index = self._read_index(msgids_table_offset, numofstrings, ii)
        msgstrs_index = self._read_index(
            msgstrs_table_offset, numofstrings, ii)
        # build entries
        locations = zip(msgids_index, msgstrs_index)
        for i, (msgid_loc, msgstr_loc) in enumerate(locations):
            msgid = self._read_string(msgid_loc)
            msgstr = self._read_string(msgstr_loc)
            if i == 0 and not msgid:  # metadata
                self.instance.metadata = self._parse_metadata(msgstr)
                continue
            # test if we have a plural entry: singular and plural forms are
            # separated by a NUL byte
            msgid_tokens = msgid.split(b('\0'))
            if len(msgid_tokens) > 1:
                entry = self._build_entry(
                    msgid=msgid_tokens[0],
                    msgid_plural=msgid_tokens[1],
                    msgstr_plural=dict(enumerate(msgstr.split(b('\0'))))
                )
            else:
                entry = self._build_entry(msgid=msgid, msgstr=msgstr)
            self.instance.append(entry)

    def _read_index(self, offset, count, fmt):
        """Read ``count`` (length, offset) pairs located at ``offset``."""
        self.fhandle.seek(offset)
        return [self._readbinary(fmt, 8) for _ in range(count)]

    def _read_string(self, location):
        """Read the raw bytes described by a (length, offset) pair."""
        length, offset = location
        self.fhandle.seek(offset)
        return self.fhandle.read(length)

    def _parse_metadata(self, msgstr):
        """Turn the raw ``key: value`` header lines into a dict."""
        encoding = self.instance.encoding
        metadata = {}
        for line in msgstr.split(b('\n')):
            tokens = line.split(b(':'), 1)
            if tokens[0] == b(''):
                continue
            key = tokens[0].decode(encoding)
            if len(tokens) > 1:
                metadata[key] = tokens[1].decode(encoding).strip()
            else:
                # line without a colon: keep the key with an empty value
                metadata[key] = u('')
        return metadata

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
                     msgstr_plural=None):
        """
        Build a ``MOEntry`` from raw (undecoded) byte strings.

        ``msgid`` may be prefixed by a context followed by an EOT byte
        (``\\x04``).  Only the first EOT is treated as the separator so that
        the remainder of the msgid is never truncated.  Empty or missing
        ``msgstr``, ``msgid_plural`` and ``msgstr_plural`` are left to the
        entry defaults.  Note that ``msgstr_plural`` is decoded in place.
        """
        msgctxt_msgid = msgid.split(b('\x04'), 1)
        encoding = self.instance.encoding
        if len(msgctxt_msgid) > 1:
            kwargs = {
                'msgctxt': msgctxt_msgid[0].decode(encoding),
                'msgid': msgctxt_msgid[1].decode(encoding),
            }
        else:
            kwargs = {'msgid': msgid.decode(encoding)}
        if msgstr:
            kwargs['msgstr'] = msgstr.decode(encoding)
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
        if msgstr_plural:
            for k in msgstr_plural:
                msgstr_plural[k] = msgstr_plural[k].decode(encoding)
            kwargs['msgstr_plural'] = msgstr_plural
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpack n bytes of data using format <fmt>.
        It returns a tuple or a mixed value if the tuple length is 1.

        ``struct.error`` is raised by ``struct.unpack`` when fewer bytes
        than the format requires could be read.
        """
        data = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, data)
        if len(tup) == 1:
            return tup[0]
        return tup
1804 # }}}