Bug 18499: (follow-up) Add missing filters
[koha.git] / misc / translator / xgettext.pl
blobf7b42e589a64d307911c330c94782997a6913ded
1 #!/usr/bin/perl
3 =head1 NAME
5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
7 =cut
9 use FindBin;
10 use lib $FindBin::Bin;
12 use strict;
13 use warnings;
14 use Getopt::Long;
15 use POSIX;
16 use Locale::PO;
17 use TmplTokenizer;
18 use VerboseWarnings;
20 use vars qw( $convert_from );
21 use vars qw( $files_from $directory $output $sort );
22 use vars qw( $extract_all_p );
23 use vars qw( $pedantic_p );
24 use vars qw( %text %translation );
25 use vars qw( $charset_in $charset_out );
26 use vars qw( $disable_fuzzy_p );
27 use vars qw( $verbose_p );
28 use vars qw( $po_mode_p );
30 our $OUTPUT;
32 ###############################################################################
34 sub string_negligible_p {
35 my($t) = @_; # a string
36 # Don't emit pure whitespace, pure numbers, pure punctuation,
37 # single letters, or TMPL_VAR's.
38 # Punctuation should arguably be translated. But without context
39 # they are untranslatable. Note that $t is a string, not a token object.
40 return !$extract_all_p && (
41 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
42 || $t =~ /^\d+$/ # purely digits
43 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
44 || $t =~ /^[A-Za-z]$/ # single letters
45 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
46 || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities
50 sub token_negligible_p {
51 my ($x) = @_;
52 my $t = $x->type;
53 return !$extract_all_p && (
54 $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
55 : $t == C4::TmplTokenType::DIRECTIVE() ? 1
56 : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
57 && join(
58 '',
59 map {
60 my $t = $_->type;
61 $t == C4::TmplTokenType::DIRECTIVE() ? '1'
62 : $t == C4::TmplTokenType::TAG() ? ''
63 : token_negligible_p($_) ? ''
64 : '1'
65 } @{ $x->children }
66 ) eq ''
70 ###############################################################################
72 sub remember {
73 my($token, $string) = @_;
74 # If we determine that the string is negligible, don't bother to remember
75 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
76 my $key = TmplTokenizer::string_canon( $string );
77 $text{$key} = [] unless defined $text{$key};
78 push @{$text{$key}}, $token;
82 ###############################################################################
84 sub string_list {
85 my @t = keys %text;
86 # The real gettext tools seems to sort case sensitively; I don't know why
87 @t = sort { $a cmp $b } @t if $sort eq 's';
88 @t = sort {
89 my @aa = sort { $a->pathname cmp $b->pathname
90 || $a->line_number <=> $b->line_number } @{$text{$a}};
91 my @bb = sort { $a->pathname cmp $b->pathname
92 || $a->line_number <=> $b->line_number } @{$text{$b}};
93 $aa[0]->pathname cmp $bb[0]->pathname
94 || $aa[0]->line_number <=> $bb[0]->line_number;
95 } @t if $sort eq 'F';
96 return @t;
99 ###############################################################################
101 sub text_extract {
102 my($h) = @_;
103 for (;;) {
104 my $s = TmplTokenizer::next_token $h;
105 last unless defined $s;
106 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
107 if ($kind eq C4::TmplTokenType::TEXT) {
108 if ($t =~ /\S/s && $t !~ /<!/){
109 remember( $s, $t );
111 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
112 if ($s->form =~ /\S/s && $s->form !~ /<!/){
113 remember( $s, $s->form );
115 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
116 # value [tag=input], meta
117 my $tag;
118 $tag = lc($1) if $t =~ /^<(\S+)/s;
119 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
120 if ($attr->{$a}) {
121 next if $a eq 'label' && $tag ne 'optgroup';
122 next if $a eq 'content' && $tag ne 'meta';
123 next if $a eq 'value' && ($tag ne 'input'
124 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
125 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
126 $val = TmplTokenizer::trim $val;
127 # for selected attributes replace '[%..%]' with '%s' globally
128 if ( $a =~ /title|value|alt|content|placeholder/ ) {
129 $val =~ s/\[\%.*?\%\]/\%s/g;
131 # save attribute text for translation
132 remember( $s, $val ) if $val =~ /\S/s;
135 } elsif ($s->has_js_data) {
136 for my $t (@{$s->js_data}) {
137 remember( $s, $t->[3] ) if $t->[0]; # FIXME
143 ###############################################################################
145 sub generate_strings_list {
146 # Emit all extracted strings.
147 for my $t (string_list) {
148 printf $OUTPUT "%s\n", $t;
152 ###############################################################################
154 sub generate_po_file {
155 # We don't emit the Plural-Forms header; it's meaningless for us
156 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
157 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
158 # Time stamps aren't exactly right semantically. I don't know how to fix it.
159 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
160 my $time_pot = $time;
161 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
162 print $OUTPUT <<EOF;
163 # SOME DESCRIPTIVE TITLE.
164 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
165 # This file is distributed under the same license as the PACKAGE package.
166 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
169 print $OUTPUT <<EOF unless $disable_fuzzy_p;
170 #, fuzzy
172 print $OUTPUT <<EOF;
173 msgid ""
174 msgstr ""
175 "Project-Id-Version: PACKAGE VERSION\\n"
176 "POT-Creation-Date: $time_pot\\n"
177 "PO-Revision-Date: $time_po\\n"
178 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
179 "Language-Team: LANGUAGE <LL\@li.org>\\n"
180 "MIME-Version: 1.0\\n"
181 "Content-Type: text/plain; charset=$pot_charset\\n"
182 "Content-Transfer-Encoding: 8bit\\n"
185 my $directory_re = quotemeta("$directory/");
186 for my $t (string_list) {
187 if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
188 my($token, $n) = ($text{$t}->[0], 0);
189 printf $OUTPUT "#. For the first occurrence,\n"
190 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
191 for my $param ($token->parameters_and_fields) {
192 $n += 1;
193 my $type = $param->type;
194 my $subtype = ($type == C4::TmplTokenType::TAG
195 && $param->string =~ /^<input\b/is?
196 $param->attributes->{'type'}->[1]: undef);
197 my $fmt = TmplTokenizer::_formalize( $param );
198 $fmt =~ s/^%/%$n\$/;
199 if ($type == C4::TmplTokenType::DIRECTIVE) {
200 # $type = "Template::Toolkit Directive";
201 $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
202 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
203 $2: undef;
204 printf $OUTPUT "#. %s: %s\n", $fmt,
205 "$type" . (defined $name? " name=$name": '');
206 } else {
207 my $name = $param->attributes->{'name'};
208 my $value;
209 $value = $param->attributes->{'value'}
210 unless $subtype =~ /^(?:text)$/;
211 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
212 . (defined $name? " name=$name->[1]": '')
213 . (defined $value? " value=$value->[1]": '');
216 } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
217 my($token) = ($text{$t}->[0]);
218 printf $OUTPUT "#. For the first occurrence,\n"
219 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
220 if ($token->string =~ /^<meta\b/is) {
221 my $type = $token->attributes->{'http-equiv'}->[1];
222 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
223 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
224 my $tag = uc($1);
225 my $type = (lc($tag) eq 'input'?
226 $token->attributes->{'type'}: undef);
227 my $name = $token->attributes->{'name'};
228 printf $OUTPUT "#. %s\n", $tag
229 . (defined $type? " type=$type->[1]": '')
230 . (defined $name? " name=$name->[1]": '');
232 } elsif ($text{$t}->[0]->has_js_data) {
233 printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
234 printf $OUTPUT "#. SCRIPT\n";
236 my $cformat_p;
237 for my $token (@{$text{$t}}) {
238 my $pathname = $token->pathname;
239 $pathname =~ s/^$directory_re//os;
240 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
241 printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
242 if defined $pathname && defined $token->line_number;
243 $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
245 printf $OUTPUT "#, c-format\n" if $cformat_p;
246 printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po
247 TmplTokenizer::string_canon
248 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
249 printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
250 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
254 ###############################################################################
256 sub convert_translation_file {
257 open(my $INPUT, '<', $convert_from) || die "$convert_from: $!\n";
258 VerboseWarnings::set_input_file_name $convert_from;
259 while (<$INPUT>) {
260 chomp;
261 my($msgid, $msgstr) = split(/\t/);
262 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
263 unless defined $msgstr;
265 # Fixup some of the bad strings
266 $msgid =~ s/^SELECTED>//;
268 # Create dummy token
269 my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
270 remember( $token, $msgid );
271 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
272 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
274 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
275 my $candidate = TmplTokenizer::charset_canon $2;
276 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
277 if defined $charset_in && $charset_in ne $candidate;
278 $charset_in = $candidate;
280 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
281 my $candidate = TmplTokenizer::charset_canon $2;
282 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
283 if defined $charset_out && $charset_out ne $candidate;
284 $charset_out = $candidate;
287 # The following assumption is correct; that's what HTML::Template assumes
288 if (!defined $charset_in) {
289 $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
290 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
294 ###############################################################################
296 sub usage {
297 my($exitcode) = @_;
298 my $h = $exitcode? *STDERR: *STDOUT;
299 print $h <<EOF;
300 Usage: $0 [OPTIONS]
301 Extract translatable strings from given HTML::Template input files.
303 Input file location:
304 -f, --files-from=FILE Get list of input files from FILE
305 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
307 Output file location:
308 -o, --output=FILE Write output to specified file
310 HTML::Template options:
311 -a, --extract-all Extract all strings
312 --pedantic-warnings Issue warnings even for detected problems
313 which are likely to be harmless
315 Output details:
316 -s, --sort-output generate sorted output
317 -F, --sort-by-file sort output by file location
318 -v, --verbose explain what is being done
320 Informative output:
321 --help Display this help and exit
323 Try `perldoc $0' for perhaps more information.
325 exit($exitcode);
328 ###############################################################################
330 sub usage_error {
331 print STDERR "$_[0]\n" if @_;
332 print STDERR "Try `$0 --help' for more information.\n";
333 exit(-1);
336 ###############################################################################
338 Getopt::Long::config qw( bundling no_auto_abbrev );
339 GetOptions(
340 'a|extract-all' => \$extract_all_p,
341 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
342 'convert-from=s' => \$convert_from,
343 'D|directory=s' => \$directory,
344 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
345 'f|files-from=s' => \$files_from,
346 'I|input-charset=s' => \$charset_in, # INTERNAL
347 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
348 'O|output-charset=s' => \$charset_out, # INTERNAL
349 'output|o=s' => \$output,
350 'po-mode' => \$po_mode_p, # INTERNAL
351 's|sort-output' => sub { $sort = 's' },
352 'F|sort-by-file' => sub { $sort = 'F' },
353 'v|verbose' => \$verbose_p,
354 'help' => sub { usage(0) },
355 ) || usage_error;
357 VerboseWarnings::set_application_name $0;
358 VerboseWarnings::set_pedantic_mode $pedantic_p;
360 usage_error('Missing mandatory option -f')
361 unless defined $files_from || defined $convert_from;
362 $directory = '.' unless defined $directory;
364 usage_error('You cannot specify both --convert-from and --files-from')
365 if defined $convert_from && defined $files_from;
367 if (defined $output && $output ne '-') {
368 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
369 open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
370 } else {
371 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
372 open($OUTPUT, ">&STDOUT");
375 if (defined $files_from) {
376 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
377 open(my $INPUT, '<', $files_from) || die "$files_from: $!\n";
378 while (<$INPUT>) {
379 chomp;
380 my $input = /^\//? $_: "$directory/$_";
381 my $h = TmplTokenizer->new( $input );
382 $h->set_allow_cformat( 1 );
383 VerboseWarnings::set_input_file_name $input;
384 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
385 text_extract( $h );
387 close $INPUT;
388 } else {
389 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
390 convert_translation_file;
392 generate_po_file;
394 warn "This input will not work with Mozilla standards-compliant mode\n", undef
395 if TmplTokenizer::syntaxerror_p;
398 exit(-1) if TmplTokenizer::fatal_p;
400 ###############################################################################
402 =head1 DESCRIPTION
404 This script has behaviour similar to
405 xgettext(1), and generates gettext-compatible output files.
407 A gettext-like format provides the following advantages:
409 =over
411 =item -
413 Translation to non-English-like languages with different word
414 order: gettext's c-format strings can theoretically be
415 emulated if we are able to do some analysis on the .tt input
416 and treat <TMPL_VAR> in a way similar to %s.
418 =item -
420 Context for the extracted strings: the gettext format provides
421 the filenames and line numbers where each string can be found.
422 The translator can read the source file and see the context,
423 in case the string by itself can mean several different things.
425 =item -
427 Place for the translator to add comments about the translations.
429 =item -
431 Gettext-compatible tools, if any, might be usable if we adopt
432 the gettext format.
434 =back
436 This script has already been in use for over a year and should
437 be reasonable stable. Nevertheless, it is still somewhat
438 experimental and there are still some issues.
440 Please refer to the explanation in tmpl_process3 for further
441 details.
443 If you want to generate GNOME-style POTFILES.in files, such
444 files (passed to -f) can be generated thus:
446 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
447 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
448 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
449 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
451 This is, however, quite pointless, because the "create" and
452 "update" actions have already been implemented in tmpl_process3.pl.
454 =head2 Strings inside JavaScript
456 In the SCRIPT elements, the script will attempt to scan for
457 _("I<string literal>") patterns, and extract the I<string literal>
458 as a translatable string.
460 Note that the C-like _(...) notation is required.
462 The JavaScript must actually define a _ function
463 so that the code remains correct JavaScript.
464 A suitable definition of such a function can be
466 function _(s) { return s } // dummy function for gettext
468 =head1 SEE ALSO
470 tmpl_process3.pl,
471 xgettext(1),
472 Locale::PO(3),
473 translator_doc.txt
475 =head1 BUGS
477 There probably are some. Bugs related to scanning of <INPUT>
478 tags seem to be especially likely to be present.
480 Its diagnostics are probably too verbose.
482 When a <TMPL_VAR> within a JavaScript-related attribute is
483 detected, the script currently displays no warnings at all.
484 It might be good to display some kind of warning.
486 Its sort order (-s option) seems to be different than the real
487 xgettext(1)'s sort option. This will result in translation
488 strings inside the generated PO file spuriously moving about
489 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
491 If a Javascript string has leading spaces, it will
492 generate strings with spurious leading spaces,
493 leading to failure to match the strings when actually generating
494 translated files.
496 =cut