5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
10 use lib
$FindBin::Bin
;
20 use vars
qw( $convert_from );
21 use vars qw( $files_from $directory $output $sort );
22 use vars qw( $extract_all_p );
23 use vars qw( $pedantic_p );
24 use vars qw( %text %translation );
25 use vars qw( $charset_in $charset_out );
26 use vars qw( $disable_fuzzy_p );
27 use vars qw( $verbose_p );
28 use vars qw( $po_mode_p );
32 ###############################################################################
34 sub string_negligible_p {
35 my($t) = @_; # a string
36 # Don't emit pure whitespace, pure numbers, pure punctuation,
37 # single letters, or TMPL_VAR's.
38 # Punctuation should arguably be translated. But without context
39 # they are untranslatable. Note that $t is a string, not a token object.
40 return !$extract_all_p && (
41 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
42 || $t =~ /^\d+$/ # purely digits
43 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
44 || $t =~ /^[A-Za-z]$/ # single letters
45 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
46 || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities
50 sub token_negligible_p {
53 return !$extract_all_p && (
54 $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
55 : $t == C4::TmplTokenType::DIRECTIVE() ? 1
56 : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
61 $t == C4::TmplTokenType::DIRECTIVE() ? '1'
62 : $t == C4::TmplTokenType::TAG() ? ''
63 : token_negligible_p($_) ? ''
70 ###############################################################################
73 my($token, $string) = @_;
74 # If we determine that the string is negligible, don't bother to remember
75 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
76 my $key = TmplTokenizer::string_canon( $string );
77 $text{$key} = [] unless defined $text{$key};
78 push @{$text{$key}}, $token;
82 ###############################################################################
86 # The real gettext tools seems to sort case sensitively; I don't know why
87 @t = sort { $a cmp $b } @t if $sort eq 's';
89 my @aa = sort { $a->pathname cmp $b->pathname
90 || $a->line_number <=> $b->line_number } @{$text{$a}};
91 my @bb = sort { $a->pathname cmp $b->pathname
92 || $a->line_number <=> $b->line_number } @{$text{$b}};
93 $aa[0]->pathname cmp $bb[0]->pathname
94 || $aa[0]->line_number <=> $bb[0]->line_number;
99 ###############################################################################
104 my $s = TmplTokenizer::next_token $h;
105 last unless defined $s;
106 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
107 if ($kind eq C4::TmplTokenType::TEXT) {
108 if ($t =~ /\S/s && $t !~ /<!/){
111 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
112 if ($s->form =~ /\S/s && $s->form !~ /<!/){
113 remember( $s, $s->form );
115 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
116 # value [tag=input], meta
118 $tag = lc($1) if $t =~ /^<(\S+)/s;
119 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
121 next if $a eq 'label' && $tag ne 'optgroup';
122 next if $a eq 'content' && $tag ne 'meta';
123 next if $a eq 'value' && ($tag ne 'input'
124 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
125 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
126 $val = TmplTokenizer::trim $val;
127 remember( $s, $val ) if $val =~ /\S/s;
130 } elsif ($s->has_js_data) {
131 for my $t (@{$s->js_data}) {
132 remember( $s, $t->[3] ) if $t->[0]; # FIXME
138 ###############################################################################
140 sub generate_strings_list {
141 # Emit all extracted strings.
142 for my $t (string_list) {
143 printf $OUTPUT "%s\n", $t;
147 ###############################################################################
149 sub generate_po_file {
150 # We don't emit the Plural-Forms header; it's meaningless for us
151 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
152 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
153 # Time stamps aren't exactly right semantically. I don't know how to fix it.
154 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
155 my $time_pot = $time;
156 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
158 # SOME DESCRIPTIVE TITLE.
159 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
160 # This file is distributed under the same license as the PACKAGE package.
161 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
164 print $OUTPUT <<EOF unless $disable_fuzzy_p;
170 "Project-Id-Version: PACKAGE VERSION\\n"
171 "POT-Creation-Date: $time_pot\\n"
172 "PO-Revision-Date: $time_po\\n"
173 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
174 "Language-Team: LANGUAGE <LL\@li.org>\\n"
175 "MIME-Version: 1.0\\n"
176 "Content-Type: text/plain; charset=$pot_charset\\n"
177 "Content-Transfer-Encoding: 8bit\\n"
180 my $directory_re = quotemeta("$directory/");
181 for my $t (string_list
) {
182 if ($text{$t}->[0]->type == C4
::TmplTokenType
::TEXT_PARAMETRIZED
) {
183 my($token, $n) = ($text{$t}->[0], 0);
184 printf $OUTPUT "#. For the first occurrence,\n"
185 if @
{$text{$t}} > 1 && $token->parameters_and_fields > 0;
186 for my $param ($token->parameters_and_fields) {
188 my $type = $param->type;
189 my $subtype = ($type == C4
::TmplTokenType
::TAG
190 && $param->string =~ /^<input\b/is?
191 $param->attributes->{'type'}->[1]: undef);
192 my $fmt = TmplTokenizer
::_formalize
( $param );
194 if ($type == C4
::TmplTokenType
::DIRECTIVE
) {
195 # $type = "Template::Toolkit Directive";
196 $type = $param->string =~ /\[%(.*?)%\]/is?
$1: 'ERROR';
197 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
199 printf $OUTPUT "#. %s: %s\n", $fmt,
200 "$type" . (defined $name?
" name=$name": '');
202 my $name = $param->attributes->{'name'};
204 $value = $param->attributes->{'value'}
205 unless $subtype =~ /^(?:text)$/;
206 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
207 . (defined $name?
" name=$name->[1]": '')
208 . (defined $value?
" value=$value->[1]": '');
211 } elsif ($text{$t}->[0]->type == C4
::TmplTokenType
::TAG
) {
212 my($token) = ($text{$t}->[0]);
213 printf $OUTPUT "#. For the first occurrence,\n"
214 if @
{$text{$t}} > 1 && $token->parameters_and_fields > 0;
215 if ($token->string =~ /^<meta\b/is) {
216 my $type = $token->attributes->{'http-equiv'}->[1];
217 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
218 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
220 my $type = (lc($tag) eq 'input'?
221 $token->attributes->{'type'}: undef);
222 my $name = $token->attributes->{'name'};
223 printf $OUTPUT "#. %s\n", $tag
224 . (defined $type?
" type=$type->[1]": '')
225 . (defined $name?
" name=$name->[1]": '');
227 } elsif ($text{$t}->[0]->has_js_data) {
228 printf $OUTPUT "#. For the first occurrence,\n" if @
{$text{$t}} > 1;
229 printf $OUTPUT "#. SCRIPT\n";
232 for my $token (@
{$text{$t}}) {
233 my $pathname = $token->pathname;
234 $pathname =~ s/^$directory_re//os;
235 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
236 printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
237 if defined $pathname && defined $token->line_number;
238 $cformat_p = 1 if $token->type == C4
::TmplTokenType
::TEXT_PARAMETRIZED
;
240 printf $OUTPUT "#, c-format\n" if $cformat_p;
241 printf $OUTPUT "msgid %s\n", TmplTokenizer
::quote_po
242 TmplTokenizer
::string_canon
243 TmplTokenizer
::charset_convert
$t, $charset_in, $charset_out;
244 printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
245 TmplTokenizer
::quote_po
( $translation{$t} ): "\"\"");
249 ###############################################################################
251 sub convert_translation_file
{
252 open(my $INPUT, '<', $convert_from) || die "$convert_from: $!\n";
253 VerboseWarnings
::set_input_file_name
$convert_from;
256 my($msgid, $msgstr) = split(/\t/);
257 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
258 unless defined $msgstr;
260 # Fixup some of the bad strings
261 $msgid =~ s/^SELECTED>//;
264 my $token = TmplToken
->new( $msgid, C4
::TmplTokenType
::UNKNOWN
, undef, undef );
265 remember
( $token, $msgid );
266 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
267 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
269 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
270 my $candidate = TmplTokenizer
::charset_canon
$2;
271 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
272 if defined $charset_in && $charset_in ne $candidate;
273 $charset_in = $candidate;
275 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
276 my $candidate = TmplTokenizer
::charset_canon
$2;
277 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
278 if defined $charset_out && $charset_out ne $candidate;
279 $charset_out = $candidate;
282 # The following assumption is correct; that's what HTML::Template assumes
283 if (!defined $charset_in) {
284 $charset_in = $charset_out = TmplTokenizer
::charset_canon
'utf-8';
285 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
289 ###############################################################################
293 my $h = $exitcode?
*STDERR
: *STDOUT
;
296 Extract translatable strings from given HTML::Template input files.
299 -f, --files-from=FILE Get list of input files from FILE
300 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
302 Output file location:
303 -o, --output=FILE Write output to specified file
305 HTML::Template options:
306 -a, --extract-all Extract all strings
307 --pedantic-warnings Issue warnings even for detected problems
308 which are likely to be harmless
311 -s, --sort-output generate sorted output
312 -F, --sort-by-file sort output by file location
313 -v, --verbose explain what is being done
316 --help Display this help and exit
318 Try `perldoc $0' for perhaps more information.
323 ###############################################################################
326 print STDERR
"$_[0]\n" if @_;
327 print STDERR
"Try `$0 --help' for more information.\n";
331 ###############################################################################
333 Getopt
::Long
::config
qw( bundling no_auto_abbrev );
335 'a|extract-all' => \
$extract_all_p,
336 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
337 'convert-from=s' => \
$convert_from,
338 'D|directory=s' => \
$directory,
339 'disable-fuzzy' => \
$disable_fuzzy_p, # INTERNAL
340 'f|files-from=s' => \
$files_from,
341 'I|input-charset=s' => \
$charset_in, # INTERNAL
342 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
343 'O|output-charset=s' => \
$charset_out, # INTERNAL
344 'output|o=s' => \
$output,
345 'po-mode' => \
$po_mode_p, # INTERNAL
346 's|sort-output' => sub { $sort = 's' },
347 'F|sort-by-file' => sub { $sort = 'F' },
348 'v|verbose' => \
$verbose_p,
349 'help' => sub { usage
(0) },
352 VerboseWarnings
::set_application_name
$0;
353 VerboseWarnings
::set_pedantic_mode
$pedantic_p;
355 usage_error
('Missing mandatory option -f')
356 unless defined $files_from || defined $convert_from;
357 $directory = '.' unless defined $directory;
359 usage_error
('You cannot specify both --convert-from and --files-from')
360 if defined $convert_from && defined $files_from;
362 if (defined $output && $output ne '-') {
363 print STDERR
"$0: Opening output file \"$output\"\n" if $verbose_p;
364 open($OUTPUT, '>', $output) || die "$output: $!\n";
366 print STDERR
"$0: Outputting to STDOUT...\n" if $verbose_p;
367 open($OUTPUT, ">&STDOUT");
370 if (defined $files_from) {
371 print STDERR
"$0: Opening input file list \"$files_from\"\n" if $verbose_p;
372 open(my $INPUT, '<', $files_from) || die "$files_from: $!\n";
375 my $input = /^\//? $_: "$directory/$_";
376 my $h = TmplTokenizer->new( $input );
377 $h->set_allow_cformat( 1 );
378 VerboseWarnings::set_input_file_name $input;
379 print STDERR "$0: Processing file
\"$input\"\n" if $verbose_p;
384 print STDERR "$0: Converting
\"$convert_from\"\n" if $verbose_p;
385 convert_translation_file;
389 warn "This input will
not work with Mozilla standards
-compliant mode
\n", undef
390 if TmplTokenizer::syntaxerror_p;
393 exit(-1) if TmplTokenizer::fatal_p;
395 ###############################################################################
399 This script has behaviour similar to
400 xgettext(1), and generates gettext-compatible output files.
402 A gettext-like format provides the following advantages:
408 Translation to non-English-like languages with different word
409 order: gettext's c-format strings can theoretically be
410 emulated if we are able to do some analysis on the .tt input
411 and treat <TMPL_VAR> in a way similar to %s.
415 Context for the extracted strings: the gettext format provides
416 the filenames and line numbers where each string can be found.
417 The translator can read the source file and see the context,
418 in case the string by itself can mean several different things.
422 Place for the translator to add comments about the translations.
426 Gettext-compatible tools, if any, might be usable if we adopt
431 This script has already been in use for over a year and should
432 be reasonable stable. Nevertheless, it is still somewhat
433 experimental and there are still some issues.
435 Please refer to the explanation in tmpl_process3 for further
438 If you want to generate GNOME-style POTFILES.in files, such
439 files (passed to -f) can be generated thus:
441 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
442 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
443 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
444 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
446 This is, however, quite pointless, because the "create
" and
447 "update
" actions have already been implemented in tmpl_process3.pl.
449 =head2 Strings inside JavaScript
451 In the SCRIPT elements, the script will attempt to scan for
452 _("I
<string literal
>") patterns, and extract the I<string literal>
453 as a translatable string.
455 Note that the C-like _(...) notation is required.
457 The JavaScript must actually define a _ function
458 so that the code remains correct JavaScript.
459 A suitable definition of such a function can be
461 function _(s) { return s } // dummy function for gettext
472 There probably are some. Bugs related to scanning of <INPUT>
473 tags seem to be especially likely to be present.
475 Its diagnostics are probably too verbose.
477 When a <TMPL_VAR> within a JavaScript-related attribute is
478 detected, the script currently displays no warnings at all.
479 It might be good to display some kind of warning.
481 Its sort order (-s option) seems to be different than the real
482 xgettext(1)'s sort option. This will result in translation
483 strings inside the generated PO file spuriously moving about
484 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
486 If a Javascript string has leading spaces, it will
487 generate strings with spurious leading spaces,
488 leading to failure to match the strings when actually generating