Updated Arabic Translation by Djihed Afifi.
[straw.git] / tools / pygettext.py
blob197a89b3871af0cad225c2ac1af7d5d189cf6781
1 #! /usr/bin/env python
2 # -*- coding: iso-8859-1 -*-
3 # Originally written by Barry Warsaw <barry@zope.com>
5 # Minimally patched to make it even more xgettext compatible
6 # by Peter Funk <pf@artcom-gmbh.de>
8 # 2002-11-22 Jürgen Hermann <jh@web.de>
9 # Added checks that _() only contains string literals, and
10 # command line args are resolved to module lists, i.e. you
11 # can now pass a filename, a module or package name, or a
12 # directory (including globbing chars, important for Win32).
13 # Made docstring fit in 80 chars wide displays using pydoc.
16 # for selftesting
try:
    import fintl
except ImportError:
    # fintl is not installed: fall back to an identity "translation" so
    # that every _() call simply returns its argument.
    def _(s):
        return s
else:
    _ = fintl.gettext
23 __doc__ = _("""pygettext -- Python equivalent of xgettext(1)
25 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
26 internationalization of C programs. Most of these tools are independent of
27 the programming language and can be used from within Python programs.
28 Martin von Loewis' work[1] helps considerably in this regard.
30 There's one problem though; xgettext is the program that scans source code
31 looking for message strings, but it groks only C (or C++). Python
32 introduces a few wrinkles, such as dual quoting characters, triple quoted
33 strings, and raw strings. xgettext understands none of this.
35 Enter pygettext, which uses Python's standard tokenize module to scan
36 Python source code, generating .pot files identical to what GNU xgettext[2]
37 generates for C and C++ code. From there, the standard GNU tools can be
38 used.
40 A word about marking Python strings as candidates for translation. GNU
41 xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
42 and gettext_noop. But those can be a lot of text to include all over your
43 code. C and C++ have a trick: they use the C preprocessor. Most
44 internationalized C source includes a #define for gettext() to _() so that
45 what has to be written in the source is much less. Thus these are both
46 translatable strings:
48 gettext("Translatable String")
49 _("Translatable String")
51 Python of course has no preprocessor so this doesn't work so well. Thus,
52 pygettext searches only for _() by default, but see the -k/--keyword flag
53 below for how to augment this.
55 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
56 [2] http://www.gnu.org/software/gettext/gettext.html
58 NOTE: pygettext attempts to be option and feature compatible with GNU
59 xgettext where ever possible. However some options are still missing or are
60 not fully implemented. Also, xgettext's use of command line switches with
61 option arguments is broken, and in these cases, pygettext just defines
62 additional switches.
64 Usage: pygettext [options] inputfile ...
66 Options:
69 --extract-all
70 Extract all strings.
72 -d name
73 --default-domain=name
74 Rename the default output file from messages.pot to name.pot.
77 --escape
78 Replace non-ASCII characters with octal escape sequences.
81 --docstrings
82 Extract module, class, method, and function docstrings. These do
83 not need to be wrapped in _() markers, and in fact cannot be for
84 Python to consider them docstrings. (See also the -X option).
87 --help
88 Print this help message and exit.
90 -k word
91 --keyword=word
92 Keywords to look for in addition to the default set, which are:
93 %(DEFAULTKEYWORDS)s
95 You can have multiple -k flags on the command line.
98 --no-default-keywords
99 Disable the default set of keywords (see above). Any keywords
100 explicitly added with the -k/--keyword option are still recognized.
102 --no-location
103 Do not write filename/lineno location comments.
106 --add-location
107 Write filename/lineno location comments indicating where each
108 extracted string is found in the source. These lines appear before
109 each msgid. The style of comments is controlled by the -S/--style
110 option. This is the default.
112 -o filename
113 --output=filename
114 Rename the default output file from messages.pot to filename. If
115 filename is `-' then the output is sent to standard out.
117 -p dir
118 --output-dir=dir
119 Output files will be placed in directory dir.
121 -S stylename
122 --style stylename
123 Specify which style to use for location comments. Two styles are
124 supported:
126 Solaris # File: filename, line: line-number
127 GNU #: filename:line
129 The style name is case insensitive. GNU style is the default.
132 --verbose
133 Print the names of the files being processed.
136 --version
137 Print the version of pygettext and exit.
139 -w columns
140 --width=columns
141 Set width of output to columns.
143 -x filename
144 --exclude-file=filename
145 Specify a file that contains a list of strings that are not be
146 extracted from the input files. Each string to be excluded must
147 appear on a line by itself in the file.
149 -X filename
150 --no-docstrings=filename
151 Specify a file that contains a list of files (one per line) that
152 should not have their docstrings extracted. This is only useful in
153 conjunction with the -D option above.
155 If `inputfile' is -, standard input is read.
156 """)
158 import os
159 import re
160 import imp
161 import sys
162 import glob
163 import time
164 import getopt
165 import token
166 import tokenize
167 import operator
# Version string; reported by -V/--version and written into the
# "Generated-By" line of the .pot header.
__version__ = '1.5'
# Function names whose string arguments are extracted by default.
default_keywords = ['_']
# Comma-separated form of the defaults, interpolated into the help text
# through the %(DEFAULTKEYWORDS)s placeholder.
DEFAULTKEYWORDS = ', '.join(default_keywords)
# Separator used when joining string fragments.
EMPTYSTRING = ''
178 # The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
179 # there.
180 pot_header = _('''\
181 # SOME DESCRIPTIVE TITLE.
182 # Copyright (C) YEAR ORGANIZATION
183 # FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
185 msgid ""
186 msgstr ""
187 "Project-Id-Version: PACKAGE VERSION\\n"
188 "POT-Creation-Date: %(time)s\\n"
189 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
190 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
191 "Language-Team: LANGUAGE <LL@li.org>\\n"
192 "MIME-Version: 1.0\\n"
193 "Content-Type: text/plain; charset=CHARSET\\n"
194 "Content-Transfer-Encoding: 8bit\\n"
195 "Generated-By: pygettext.py %(version)s\\n"
196 ''')
def usage(code, msg=''):
    """Print the help text to stderr and terminate the process.

    The help text is the module docstring with its %(NAME)s placeholders
    filled in from the module globals.

    code -- exit status handed to sys.exit()
    msg  -- optional extra message (a string or an exception object),
            printed on its own line after the help text

    Always raises SystemExit.
    """
    # sys.stderr.write() is used instead of the Python-2-only
    # "print >> sys.stderr" statement so the function parses everywhere.
    sys.stderr.write('%s\n' % (__doc__ % globals(),))
    if msg:
        sys.stderr.write('%s\n' % (msg,))
    sys.exit(code)
# Table indexed by character code (0..255) giving the text to emit for that
# character inside a .po string.  Empty until make_escapes() is called.
escapes = []


def make_escapes(pass_iso8859):
    """(Re)build the module-level `escapes` table.

    pass_iso8859 -- when true, codes whose low seven bits fall in 32..126
                    are emitted as-is, which lets most iso-8859 characters
                    through; when false, every code outside 32..126 is
                    written as a three-digit octal escape.

    Backslash, tab, carriage return, newline and the double quote always
    get their C-style escapes.

    The table is replaced in place on every call, so calling this function
    more than once yields a fresh 256-entry table instead of appending a
    second copy behind the first one.
    """
    if pass_iso8859:
        # Allow iso-8859 characters to pass through so that e.g. 'msgid
        # "H\xf6he"' (o-umlaut) would not result in 'msgid "H\366he"'.
        # Otherwise we escape any character outside the 32..126 range.
        mod = 128
    else:
        mod = 256
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    # Slice assignment keeps the identity of the list, so any code already
    # holding a reference to `escapes` sees the new contents.
    escapes[:] = table
def escape(s):
    """Return s with every character replaced by its `escapes` entry.

    s -- the string to convert

    The `escapes` table only covers codes 0..255.  Characters with a code
    point of 256 or above (possible with unicode input) have no entry and
    are passed through unchanged rather than raising IndexError.

    make_escapes() must have been called first; with an empty table the
    lookup of any character below 256 still raises IndexError.
    """
    pieces = []
    for ch in s:
        code = ord(ch)
        if code < 256:
            pieces.append(escapes[code])
        else:
            pieces.append(ch)
    return EMPTYSTRING.join(pieces)
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    s -- source text of a Python string literal, quotes included
    """
    # NOTE(review): eval() with emptied builtins is only a partial sandbox.
    # The callers in this file pass STRING tokens produced by the tokenizer;
    # do not feed arbitrary expressions to this function.
    restricted_globals = {'__builtins__': {}}
    return eval(s, restricted_globals, {})
def normalize(s):
    """Convert a Python string value into the quoted form used in .po files.

    A value without newlines becomes one double-quoted, escaped string.
    A value with newlines becomes an empty "" line followed by one quoted
    string per source line, each but the last ending in an escaped newline;
    a trailing newline in s is kept on the last quoted line.
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # s ended with a newline: drop the empty tail and put the newline
        # back on the real last line so escape() renders it as \n.
        del pieces[-1]
        pieces[-1] = pieces[-1] + '\n'
    escaped = [escape(piece) for piece in pieces]
    separator = '\\n"\n"'
    return '""\n"' + separator.join(escaped) + '"'
def containsAny(str, set):
    """Return True when at least one character of 'set' occurs in 'str'."""
    for candidate in set:
        if candidate in str:
            return True
    return False
def _visit_pyfiles(list, dirname, names):
    """Helper for getFilesForName(): collect the Python sources of a directory.

    list    -- output list; the joined path of every matching entry is
               appended to it
    dirname -- the directory being visited
    names   -- names of the entries in dirname; 'CVS' is removed from it in
               place so that a walker honouring the list skips it
    """
    # get extension for python source files (looked up once, then cached in
    # the module global _py_ext)
    global _py_ext
    if '_py_ext' not in globals():
        try:
            from importlib.machinery import SOURCE_SUFFIXES
        except ImportError:
            # Interpreters without importlib.machinery: ask imp instead.
            _py_ext = [triple[0] for triple in imp.get_suffixes()
                       if triple[2] == imp.PY_SOURCE][0]
        else:
            _py_ext = SOURCE_SUFFIXES[0]

    # don't recurse into CVS directories
    if 'CVS' in names:
        names.remove('CVS')

    # add all *.py files to list
    list.extend(
        [os.path.join(dirname, entry) for entry in names
         if os.path.splitext(entry)[1] == _py_ext]
        )
def _get_modpkg_path(dotted_name, pathlist=None):
    """Get the filesystem path for a module or a package.

    Return the file system path to a file for a module, and to a directory
    for a package. Return None if the name is not found, or is a builtin or
    extension module.

    dotted_name -- module or package name, possibly dotted
    pathlist    -- list of directories to search, passed to
                   imp.find_module(); None leaves the choice to imp
    """
    # split off top-most name
    pieces = dotted_name.split('.', 1)

    if len(pieces) == 1:
        # plain name
        try:
            fp, location, info = imp.find_module(dotted_name, pathlist)
        except ImportError:
            return None
        if fp:
            fp.close()
        if info[2] in [imp.PY_SOURCE, imp.PKG_DIRECTORY]:
            return location
        return None

    # we have a dotted path, import top-level package
    head, remainder = pieces
    try:
        fp, location, info = imp.find_module(head, pathlist)
        if fp:
            fp.close()
    except ImportError:
        return None

    # check if it's indeed a package
    if info[2] != imp.PKG_DIRECTORY:
        return None
    # recursively handle the remaining name parts
    return _get_modpkg_path(remainder, [location])
def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    name -- an existing file or directory, a glob pattern, or a (dotted)
            module/package name

    Returns a list of paths; the list is empty when nothing matches.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            found = []
            for match in glob.glob(name):
                found.extend(getFilesForName(match))
            return found

        # try to find module or package
        name = _get_modpkg_path(name)
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        found = []
        # os.walk() replaces os.path.walk(), which only exists on Python 2.
        for dirpath, dirnames, filenames in os.walk(name):
            names = dirnames + filenames
            _visit_pyfiles(found, dirpath, names)
            # _visit_pyfiles() prunes `names` in place; mirror that pruning
            # in dirnames so os.walk() does not descend into removed entries.
            dirnames[:] = [d for d in dirnames if d in names]
        return found
    elif os.path.exists(name):
        # a single file
        return [name]

    return []
def isformatstring(s):
    '''Return 1 if s can be used as a %-format string, else 0.

    Named conversions such as %(name)s are first reduced to positional
    ones; the result is then trial-formatted with 1, 2, ... up to three
    times the number of '%' characters worth of integer arguments.  The
    string counts as a format string as soon as one attempt succeeds.  A
    string without any '%' is never a format string.
    '''
    fmt = re.sub(r'%\([a-zA-Z_]+[a-zA-Z_0-9]*\)', '%', s)
    for nargs in range(1, fmt.count('%') * 3 + 1):
        try:
            fmt % ((1,) * nargs)
        except Exception:
            # Wrong argument count or an invalid conversion: try the next
            # count.  Unlike a bare "except:", this lets KeyboardInterrupt
            # and SystemExit propagate.
            continue
        return 1
    return 0
class TokenEater:
    """Token consumer that collects translatable strings and docstrings.

    An instance is called once per token with the five values the tokenize
    module produces.  Each call is dispatched to the method stored in
    self.__state, which implements a small state machine:

      __waiting        -- idle; looks for keywords and, if enabled,
                          docstring positions
      __suiteseen      -- inside a 'class'/'def' header, waiting for the
                          colon that ends it
      __suitedocstring -- after that colon, looking for a docstring
      __keywordseen    -- a keyword name was seen, expecting '('
      __openseen       -- inside keyword(...), collecting string literals

    Collected messages are kept in a dict mapping the message text to a
    dict of {(filename, lineno): isdocstring}; write() emits them in .pot
    format.

    options -- object providing the attributes read here: docstrings,
               nodocstrings, keywords, toexclude, writelocations,
               locationstyle, GNU, SOLARIS and width
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None
        # nesting depth of ( [ { while scanning a class/def header
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch; only the start row of the token is passed on
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                elif ttype not in (tokenize.COMMENT, tokenize.NL):
                    self.__freshmodule = 0
                return
            # class docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__enclosurecount = 0
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        # Skip everything up to the colon that ends the header.  Colons
        # inside brackets (dict defaults, slices, annotations) belong to the
        # header itself, so bracket nesting is tracked and only a colon at
        # depth zero counts.
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                self.__state = self.__suitedocstring
            elif tstring in ('(', '[', '{'):
                self.__enclosurecount += 1
            elif tstring in (')', ']', '}'):
                self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the
            # list of messages seen.  Reset state for the next batch.  If
            # there were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            sys.stderr.write(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                } + '\n')
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        # lineno defaults to the line of the opening parenthesis recorded
        # by __keywordseen
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Name the file whose tokens follow and expect a module docstring."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the .pot header and all collected messages to fp.

        fp -- a writable text file object
        """
        options = self.__options
        timestamp = time.ctime(time.time())
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
        fp.write(pot_header % {'time': timestamp, 'version': __version__})
        fp.write('\n')
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        for rkey in sorted(reverse.keys()):
            rentries = reverse[rkey]
            # message texts are unique, so ordering by text alone is enough
            rentries.sort(key=lambda entry: entry[0])
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may
                # wish to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of
                # (filename, lineno) tuples.  We want to sort the entries in
                # v first by file name and then by line number.
                locations = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in locations:
                        d = {'filename': filename, 'lineno': lineno}
                        fp.write(_(
                            '# File: %(filename)s, line: %(lineno)d') % d
                            + '\n')
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in locations:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            fp.write(locline + '\n')
                            locline = "#:" + s
                    if len(locline) > 2:
                        fp.write(locline + '\n')

                # flag comment: python-format and/or docstring
                flags = []
                if isformatstring(k):
                    flags.append('python-format')
                if isdocstring:
                    flags.append('docstring')
                if flags:
                    fp.write('#, ' + ', '.join(flags) + '\n')
                fp.write('msgid ' + normalize(k) + '\n')
                fp.write('msgstr ""\n\n')
def main(files=None, **kwds):
    """Extract messages from Python sources and write a .pot file.

    files -- optional list of file, directory, module or package names (or
             '-' for standard input).  When it is given and non-empty the
             command line is NOT parsed; otherwise options and input names
             are taken from sys.argv[1:].
    kwds  -- option defaults, recognised keys: escape, keywords, outpath,
             outfile, writelocations, locationstyle, verbose, width,
             excludefilename, docstrings, nodocstrings, nodocstrings_file
             and no_default_keywords.

    Exits through usage()/sys.exit() on bad options, on -h and on -V, and
    when the exclude file cannot be read.
    """
    if files:
        args = files
        opts = []
    else:
        try:
            opts, args = getopt.getopt(
                sys.argv[1:],
                'ad:DEhk:Kno:p:S:Vvw:x:X:',
                ['extract-all', 'default-domain=', 'escape', 'help',
                 'keyword=', 'no-default-keywords',
                 'add-location', 'no-location', 'output=', 'output-dir=',
                 'style=', 'verbose', 'version', 'width=', 'exclude-file=',
                 # takes a file name, hence the trailing '='
                 'docstrings', 'no-docstrings=',
                 ])
        except getopt.error as msg:
            usage(1, msg)

    # private copy of the module-level default keyword list
    base_keywords = default_keywords[:]

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = kwds.get('escape', 0)
        # copied so the caller's containers are never modified
        keywords = list(kwds.get('keywords', []))
        outpath = kwds.get('outpath', '')
        outfile = kwds.get('outfile', 'messages.pot')
        writelocations = kwds.get('writelocations', 1)
        locationstyle = kwds.get('locationstyle', GNU)
        verbose = kwds.get('verbose', 0)
        width = kwds.get('width', 78)
        excludefilename = kwds.get('excludefilename', '')
        docstrings = kwds.get('docstrings', 0)
        nodocstrings = dict(kwds.get('nodocstrings', {}))
        nodocstringsfile = kwds.get('nodocstrings_file', '')
        default_keywords = base_keywords

    options = Options()
    if kwds.get('no_default_keywords', False):
        options.default_keywords = []
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            options.default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            options.nodocstringsfile = arg

    if options.nodocstringsfile:
        fp = open(options.nodocstringsfile)
        try:
            for line in fp:
                # strip only the line terminator; slicing off the last
                # character would damage a final line without a newline
                options.nodocstrings[line.rstrip('\r\n')] = 1
        finally:
            fp.close()

    # calculate escapes: make_escapes() takes "let iso-8859 pass through",
    # which is the opposite of the -E/--escape request
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(options.default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            fp = open(options.excludefilename)
            try:
                # one string per line; the line terminator is not part of
                # the string, otherwise no extracted message would match
                options.toexclude = [line.rstrip('\r\n') for line in fp]
            finally:
                fp.close()
        except IOError:
            sys.stderr.write(_(
                "Can't read --exclude-file: %s") % options.excludefilename
                + '\n')
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename)
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                # generate_tokens() yields the same 5-tuples that the
                # callback form tokenize.tokenize(readline, eater) hands to
                # the eater, and is available on Python 2 and 3 alike
                for tok in tokenize.generate_tokens(fp.readline):
                    eater(*tok)
            except tokenize.TokenError as e:
                sys.stderr.write('%s: %s, line %d, column %d\n' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]))
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()
# Script entry point.  The _() calls after main() are reached only when
# main() returns normally; they appear to exist as extraction fixtures for
# running pygettext on its own source -- TODO confirm.
if __name__ == '__main__':
    main()
    # some more test strings
    _(u'a unicode string')
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    # adjacent literals, collected as one message
    _('more' 'than' 'one' 'string')