Lib/gettext.py

   1 """Internationalization and localization support.
   2
   3 This module provides internationalization (I18N) and localization (L10N)
   4 support for your Python programs by providing an interface to the GNU gettext
   5 message catalog library.
   6
   7 I18N refers to the operation by which a program is made aware of multiple
   8 languages.  L10N refers to the adaptation of your program, once
   9 internationalized, to the local language and cultural habits.
  10
  11 """
  12
  13 # This module represents the integration of work, contributions, feedback, and
  14 # suggestions from the following people:
  15 #
  16 # Martin von Loewis, who wrote the initial implementation of the underlying
  17 # C-based libintlmodule (later renamed _gettext), along with a skeletal
  18 # gettext.py implementation.
  19 #
  20 # Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
  21 # which also included a pure-Python implementation to read .mo files if
  22 # intlmodule wasn't available.
  23 #
  24 # James Henstridge, who also wrote a gettext.py module, which has some
  25 # interesting, but currently unsupported experimental features: the notion of
  26 # a Catalog class and instances, and the ability to add to a catalog file via
  27 # a Python API.
  28 #
  29 # Barry Warsaw integrated these modules, wrote the .install() API and code,
  30 # and conformed all C and Python code to Python's coding standards.
  31 #
  32 # Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
  33 # module.
  34 #
  35 # J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
  36 #
  37 # TODO:
  38 # - Lazy loading of .mo files.  Currently the entire catalog is loaded into
  39 #   memory, but that's probably bad for large translated programs.  Instead,
  40 #   the lexical sort of original strings in GNU .mo files should be exploited
  41 #   to do binary searches and lazy initializations.  Or you might want to use
  42 #   the undocumented double-hash algorithm for .mo files with hash tables, but
  43 #   you'll need to study the GNU gettext code to do this.
  44 #
  45 # - Support Solaris .mo file formats.  Unfortunately, we've been unable to
  46 #   find this format documented anywhere.
  47
  48
  49 import locale, copy, os, re, struct, sys
  50 from errno import ENOENT
  51
  52
  53 __all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
  54            'find', 'translation', 'install', 'textdomain', 'bindtextdomain',
  55            'dgettext', 'dngettext', 'gettext', 'ngettext',
  56            ]
  57
  58 _default_localedir = os.path.join(sys.prefix, 'share', 'locale')
  59
  60
  61 def test(condition, true, false):
  62     """
  63     Implements the C expression:
  64
  65       condition ? true : false
  66
  67     Required to correctly interpret plural forms.
  68     """
  69     if condition:
  70         return true
  71     else:
  72         return false
  73
  74
  75 def c2py(plural):
  76     """Gets a C expression as used in PO files for plural forms and returns a
  77     Python lambda function that implements an equivalent expression.
  78     """
  79     # Security check, allow only the "n" identifier
  80     try:
  81         from cStringIO import StringIO
  82     except ImportError:
  83         from StringIO import StringIO
  84     import token, tokenize
  85     tokens = tokenize.generate_tokens(StringIO(plural).readline)
  86     try:
  87         danger = [x for x in tokens if x[0] == token.NAME and x[1] != 'n']
  88     except tokenize.TokenError:
  89         raise ValueError, \
  90               'plural forms expression error, maybe unbalanced parenthesis'
  91     else:
  92         if danger:
  93             raise ValueError, 'plural forms expression could be dangerous'
  94
  95     # Replace some C operators by their Python equivalents
  96     plural = plural.replace('&&', ' and ')
  97     plural = plural.replace('||', ' or ')
  98
  99     expr = re.compile(r'\!([^=])')
 100     plural = expr.sub(' not \\1', plural)
 101
 102     # Regular expression and replacement function used to transform
 103     # "a?b:c" to "test(a,b,c)".
 104     expr = re.compile(r'(.*?)\?(.*?):(.*)')
 105     def repl(x):
 106         return "test(%s, %s, %s)" % (x.group(1), x.group(2),
 107                                      expr.sub(repl, x.group(3)))
 108
 109     # Code to transform the plural expression, taking care of parentheses
 110     stack = ['']
 111     for c in plural:
 112         if c == '(':
 113             stack.append('')
 114         elif c == ')':
 115             if len(stack) == 1:
 116                 # Actually, we never reach this code, because unbalanced
 117                 # parentheses get caught in the security check at the
 118                 # beginning.
 119                 raise ValueError, 'unbalanced parenthesis in plural form'
 120             s = expr.sub(repl, stack.pop())
 121             stack[-1] += '(%s)' % s
 122         else:
 123             stack[-1] += c
 124     plural = expr.sub(repl, stack.pop())
 125
 126     return eval('lambda n: int(%s)' % plural)
 127
 128
 129
 130 def _expand_lang(locale):
 131     from locale import normalize
 132     locale = normalize(locale)
 133     COMPONENT_CODESET   = 1 << 0
 134     COMPONENT_TERRITORY = 1 << 1
 135     COMPONENT_MODIFIER  = 1 << 2
 136     # split up the locale into its base components
 137     mask = 0
 138     pos = locale.find('@')
 139     if pos >= 0:
 140         modifier = locale[pos:]
 141         locale = locale[:pos]
 142         mask |= COMPONENT_MODIFIER
 143     else:
 144         modifier = ''
 145     pos = locale.find('.')
 146     if pos >= 0:
 147         codeset = locale[pos:]
 148         locale = locale[:pos]
 149         mask |= COMPONENT_CODESET
 150     else:
 151         codeset = ''
 152     pos = locale.find('_')
 153     if pos >= 0:
 154         territory = locale[pos:]
 155         locale = locale[:pos]
 156         mask |= COMPONENT_TERRITORY
 157     else:
 158         territory = ''
 159     language = locale
 160     ret = []
 161     for i in range(mask+1):
 162         if not (i & ~mask):  # if all components for this combo exist ...
 163             val = language
 164             if i & COMPONENT_TERRITORY: val += territory
 165             if i & COMPONENT_CODESET:   val += codeset
 166             if i & COMPONENT_MODIFIER:  val += modifier
 167             ret.append(val)
 168     ret.reverse()
 169     return ret
 170
 171
 172
 173 class NullTranslations:
 174     def __init__(self, fp=None):
 175         self._info = {}
 176         self._charset = None
 177         self._output_charset = None
 178         self._fallback = None
 179         if fp is not None:
 180             self._parse(fp)
 181
 182     def _parse(self, fp):
 183         pass
 184
 185     def add_fallback(self, fallback):
 186         if self._fallback:
 187             self._fallback.add_fallback(fallback)
 188         else:
 189             self._fallback = fallback
 190
 191     def gettext(self, message):
 192         if self._fallback:
 193             return self._fallback.gettext(message)
 194         return message
 195
 196     def lgettext(self, message):
 197         if self._fallback:
 198             return self._fallback.lgettext(message)
 199         return message
 200
 201     def ngettext(self, msgid1, msgid2, n):
 202         if self._fallback:
 203             return self._fallback.ngettext(msgid1, msgid2, n)
 204         if n == 1:
 205             return msgid1
 206         else:
 207             return msgid2
 208
 209     def lngettext(self, msgid1, msgid2, n):
 210         if self._fallback:
 211             return self._fallback.lngettext(msgid1, msgid2, n)
 212         if n == 1:
 213             return msgid1
 214         else:
 215             return msgid2
 216
 217     def ugettext(self, message):
 218         if self._fallback:
 219             return self._fallback.ugettext(message)
 220         return unicode(message)
 221
 222     def ungettext(self, msgid1, msgid2, n):
 223         if self._fallback:
 224             return self._fallback.ungettext(msgid1, msgid2, n)
 225         if n == 1:
 226             return unicode(msgid1)
 227         else:
 228             return unicode(msgid2)
 229
 230     def info(self):
 231         return self._info
 232
 233     def charset(self):
 234         return self._charset
 235
 236     def output_charset(self):
 237         return self._output_charset
 238
 239     def set_output_charset(self, charset):
 240         self._output_charset = charset
 241
 242     def install(self, unicode=False, names=None):
 243         import __builtin__
 244         __builtin__.__dict__['_'] = unicode and self.ugettext or self.gettext
 245         if hasattr(names, "__contains__"):
 246             if "gettext" in names:
 247                 __builtin__.__dict__['gettext'] = __builtin__.__dict__['_']
 248             if "ngettext" in names:
 249                 __builtin__.__dict__['ngettext'] = (unicode and self.ungettext
 250                                                              or self.ngettext)
 251             if "lgettext" in names:
 252                 __builtin__.__dict__['lgettext'] = self.lgettext
 253             if "lngettext" in names:
 254                 __builtin__.__dict__['lngettext'] = self.lngettext
 255
 256
 257 class GNUTranslations(NullTranslations):
 258     # Magic number of .mo files
 259     LE_MAGIC = 0x950412deL
 260     BE_MAGIC = 0xde120495L
 261
 262     def _parse(self, fp):
 263         """Override this method to support alternative .mo formats."""
 264         unpack = struct.unpack
 265         filename = getattr(fp, 'name', '')
 266         # Parse the .mo file header, which consists of 5 little endian 32
 267         # bit words.
 268         self._catalog = catalog = {}
 269         self.plural = lambda n: int(n != 1) # germanic plural by default
 270         buf = fp.read()
 271         buflen = len(buf)
 272         # Are we big endian or little endian?
 273         magic = unpack('<I', buf[:4])[0]
 274         if magic == self.LE_MAGIC:
 275             version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
 276             ii = '<II'
 277         elif magic == self.BE_MAGIC:
 278             version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
 279             ii = '>II'
 280         else:
 281             raise IOError(0, 'Bad magic number', filename)
 282         # Now put all messages from the .mo file buffer into the catalog
 283         # dictionary.
 284         for i in xrange(0, msgcount):
 285             mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
 286             mend = moff + mlen
 287             tlen, toff = unpack(ii, buf[transidx:transidx+8])
 288             tend = toff + tlen
 289             if mend < buflen and tend < buflen:
 290                 msg = buf[moff:mend]
 291                 tmsg = buf[toff:tend]
 292             else:
 293                 raise IOError(0, 'File is corrupt', filename)
 294             # See if we're looking at GNU .mo conventions for metadata
 295             if mlen == 0:
 296                 # Catalog description
 297                 lastk = k = None
 298                 for item in tmsg.splitlines():
 299                     item = item.strip()
 300                     if not item:
 301                         continue
 302                     if ':' in item:
 303                         k, v = item.split(':', 1)
 304                         k = k.strip().lower()
 305                         v = v.strip()
 306                         self._info[k] = v
 307                         lastk = k
 308                     elif lastk:
 309                         self._info[lastk] += '\n' + item
 310                     if k == 'content-type':
 311                         self._charset = v.split('charset=')[1]
 312                     elif k == 'plural-forms':
 313                         v = v.split(';')
 314                         plural = v[1].split('plural=')[1]
 315                         self.plural = c2py(plural)
 316             # Note: we unconditionally convert both msgids and msgstrs to
 317             # Unicode using the character encoding specified in the charset
 318             # parameter of the Content-Type header.  The gettext documentation
 319             # strongly encourages msgids to be us-ascii, but some appliations
 320             # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
 321             # traditional gettext applications, the msgid conversion will
 322             # cause no problems since us-ascii should always be a subset of
 323             # the charset encoding.  We may want to fall back to 8-bit msgids
 324             # if the Unicode conversion fails.
 325             if '\x00' in msg:
 326                 # Plural forms
 327                 msgid1, msgid2 = msg.split('\x00')
 328                 tmsg = tmsg.split('\x00')
 329                 if self._charset:
 330                     msgid1 = unicode(msgid1, self._charset)
 331                     tmsg = [unicode(x, self._charset) for x in tmsg]
 332                 for i in range(len(tmsg)):
 333                     catalog[(msgid1, i)] = tmsg[i]
 334             else:
 335                 if self._charset:
 336                     msg = unicode(msg, self._charset)
 337                     tmsg = unicode(tmsg, self._charset)
 338                 catalog[msg] = tmsg
 339             # advance to next entry in the seek tables
 340             masteridx += 8
 341             transidx += 8
 342
 343     def gettext(self, message):
 344         missing = object()
 345         tmsg = self._catalog.get(message, missing)
 346         if tmsg is missing:
 347             if self._fallback:
 348                 return self._fallback.gettext(message)
 349             return message
 350         # Encode the Unicode tmsg back to an 8-bit string, if possible
 351         if self._output_charset:
 352             return tmsg.encode(self._output_charset)
 353         elif self._charset:
 354             return tmsg.encode(self._charset)
 355         return tmsg
 356
 357     def lgettext(self, message):
 358         missing = object()
 359         tmsg = self._catalog.get(message, missing)
 360         if tmsg is missing:
 361             if self._fallback:
 362                 return self._fallback.lgettext(message)
 363             return message
 364         if self._output_charset:
 365             return tmsg.encode(self._output_charset)
 366         return tmsg.encode(locale.getpreferredencoding())
 367
 368     def ngettext(self, msgid1, msgid2, n):
 369         try:
 370             tmsg = self._catalog[(msgid1, self.plural(n))]
 371             if self._output_charset:
 372                 return tmsg.encode(self._output_charset)
 373             elif self._charset:
 374                 return tmsg.encode(self._charset)
 375             return tmsg
 376         except KeyError:
 377             if self._fallback:
 378                 return self._fallback.ngettext(msgid1, msgid2, n)
 379             if n == 1:
 380                 return msgid1
 381             else:
 382                 return msgid2
 383
 384     def lngettext(self, msgid1, msgid2, n):
 385         try:
 386             tmsg = self._catalog[(msgid1, self.plural(n))]
 387             if self._output_charset:
 388                 return tmsg.encode(self._output_charset)
 389             return tmsg.encode(locale.getpreferredencoding())
 390         except KeyError:
 391             if self._fallback:
 392                 return self._fallback.lngettext(msgid1, msgid2, n)
 393             if n == 1:
 394                 return msgid1
 395             else:
 396                 return msgid2
 397
 398     def ugettext(self, message):
 399         missing = object()
 400         tmsg = self._catalog.get(message, missing)
 401         if tmsg is missing:
 402             if self._fallback:
 403                 return self._fallback.ugettext(message)
 404             return unicode(message)
 405         return tmsg
 406
 407     def ungettext(self, msgid1, msgid2, n):
 408         try:
 409             tmsg = self._catalog[(msgid1, self.plural(n))]
 410         except KeyError:
 411             if self._fallback:
 412                 return self._fallback.ungettext(msgid1, msgid2, n)
 413             if n == 1:
 414                 tmsg = unicode(msgid1)
 415             else:
 416                 tmsg = unicode(msgid2)
 417         return tmsg
 418
 419
 420 # Locate a .mo file using the gettext strategy
 421 def find(domain, localedir=None, languages=None, all=0):
 422     # Get some reasonable defaults for arguments that were not supplied
 423     if localedir is None:
 424         localedir = _default_localedir
 425     if languages is None:
 426         languages = []
 427         for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
 428             val = os.environ.get(envar)
 429             if val:
 430                 languages = val.split(':')
 431                 break
 432         if 'C' not in languages:
 433             languages.append('C')
 434     # now normalize and expand the languages
 435     nelangs = []
 436     for lang in languages:
 437         for nelang in _expand_lang(lang):
 438             if nelang not in nelangs:
 439                 nelangs.append(nelang)
 440     # select a language
 441     if all:
 442         result = []
 443     else:
 444         result = None
 445     for lang in nelangs:
 446         if lang == 'C':
 447             break
 448         mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
 449         if os.path.exists(mofile):
 450             if all:
 451                 result.append(mofile)
 452             else:
 453                 return mofile
 454     return result
 455
 456
 457
 458 # a mapping between absolute .mo file path and Translation object
 459 _translations = {}
 460
 461 def translation(domain, localedir=None, languages=None,
 462                 class_=None, fallback=False, codeset=None):
 463     if class_ is None:
 464         class_ = GNUTranslations
 465     mofiles = find(domain, localedir, languages, all=1)
 466     if not mofiles:
 467         if fallback:
 468             return NullTranslations()
 469         raise IOError(ENOENT, 'No translation file found for domain', domain)
 470     # TBD: do we need to worry about the file pointer getting collected?
 471     # Avoid opening, reading, and parsing the .mo file after it's been done
 472     # once.
 473     result = None
 474     for mofile in mofiles:
 475         key = os.path.abspath(mofile)
 476         t = _translations.get(key)
 477         if t is None:
 478             t = _translations.setdefault(key, class_(open(mofile, 'rb')))
 479         # Copy the translation object to allow setting fallbacks and
 480         # output charset. All other instance data is shared with the
 481         # cached object.
 482         t = copy.copy(t)
 483         if codeset:
 484             t.set_output_charset(codeset)
 485         if result is None:
 486             result = t
 487         else:
 488             result.add_fallback(t)
 489     return result
 490
 491
 492 def install(domain, localedir=None, unicode=False, codeset=None, names=None):
 493     t = translation(domain, localedir, fallback=True, codeset=codeset)
 494     t.install(unicode, names)
 495
 496
 497
 498 # a mapping b/w domains and locale directories
 499 _localedirs = {}
 500 # a mapping b/w domains and codesets
 501 _localecodesets = {}
 502 # current global domain, `messages' used for compatibility w/ GNU gettext
 503 _current_domain = 'messages'
 504
 505
 506 def textdomain(domain=None):
 507     global _current_domain
 508     if domain is not None:
 509         _current_domain = domain
 510     return _current_domain
 511
 512
 513 def bindtextdomain(domain, localedir=None):
 514     global _localedirs
 515     if localedir is not None:
 516         _localedirs[domain] = localedir
 517     return _localedirs.get(domain, _default_localedir)
 518
 519
 520 def bind_textdomain_codeset(domain, codeset=None):
 521     global _localecodesets
 522     if codeset is not None:
 523         _localecodesets[domain] = codeset
 524     return _localecodesets.get(domain)
 525
 526
 527 def dgettext(domain, message):
 528     try:
 529         t = translation(domain, _localedirs.get(domain, None),
 530                         codeset=_localecodesets.get(domain))
 531     except IOError:
 532         return message
 533     return t.gettext(message)
 534
 535 def ldgettext(domain, message):
 536     try:
 537         t = translation(domain, _localedirs.get(domain, None),
 538                         codeset=_localecodesets.get(domain))
 539     except IOError:
 540         return message
 541     return t.lgettext(message)
 542
 543 def dngettext(domain, msgid1, msgid2, n):
 544     try:
 545         t = translation(domain, _localedirs.get(domain, None),
 546                         codeset=_localecodesets.get(domain))
 547     except IOError:
 548         if n == 1:
 549             return msgid1
 550         else:
 551             return msgid2
 552     return t.ngettext(msgid1, msgid2, n)
 553
 554 def ldngettext(domain, msgid1, msgid2, n):
 555     try:
 556         t = translation(domain, _localedirs.get(domain, None),
 557                         codeset=_localecodesets.get(domain))
 558     except IOError:
 559         if n == 1:
 560             return msgid1
 561         else:
 562             return msgid2
 563     return t.lngettext(msgid1, msgid2, n)
 564
 565 def gettext(message):
 566     return dgettext(_current_domain, message)
 567
 568 def lgettext(message):
 569     return ldgettext(_current_domain, message)
 570
 571 def ngettext(msgid1, msgid2, n):
 572     return dngettext(_current_domain, msgid1, msgid2, n)
 573
 574 def lngettext(msgid1, msgid2, n):
 575     return ldngettext(_current_domain, msgid1, msgid2, n)
 576
 577 # dcgettext() has been deemed unnecessary and is not implemented.
 578
 579 # James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
 580 # was:
 581 #
 582 #    import gettext
 583 #    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
 584 #    _ = cat.gettext
 585 #    print _('Hello World')
 586
# The resulting catalog object does not currently support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.
 590
 591 Catalog = translation