Bug 7607: (follow-up) Address OPAC and limits
[koha.git] / misc / translator / xgettext.pl
blobe04c5298634791f818c5f42539e30a2d4e4786d4
1 #!/usr/bin/perl
3 # This file is part of Koha.
5 # Koha is free software; you can redistribute it and/or modify it
6 # under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 3 of the License, or
8 # (at your option) any later version.
10 # Koha is distributed in the hope that it will be useful, but
11 # WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with Koha; if not, see <http://www.gnu.org/licenses>.
18 =head1 NAME
20 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
22 =cut
24 use FindBin;
25 use lib $FindBin::Bin;
27 use strict;
28 use warnings;
29 use Getopt::Long;
30 use POSIX;
31 use Locale::PO;
32 use TmplTokenizer;
33 use VerboseWarnings;
35 use vars qw( $convert_from );
36 use vars qw( $files_from $directory $output $sort );
37 use vars qw( $extract_all_p );
38 use vars qw( $pedantic_p );
39 use vars qw( %text %translation );
40 use vars qw( $charset_in $charset_out );
41 use vars qw( $disable_fuzzy_p );
42 use vars qw( $verbose_p );
43 use vars qw( $po_mode_p );
45 our $OUTPUT;
47 ###############################################################################
49 sub string_negligible_p {
50 my($t) = @_; # a string
51 # Don't emit pure whitespace, pure numbers, pure punctuation,
52 # single letters, or TMPL_VAR's.
53 # Punctuation should arguably be translated. But without context
54 # they are untranslatable. Note that $t is a string, not a token object.
55 return !$extract_all_p && (
56 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
57 || $t =~ /^\d+$/ # purely digits
58 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
59 || $t =~ /^[A-Za-z]$/ # single letters
60 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
61 || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities
62 || $t =~ /^\s*<\?.*\?>/ # ignore xml prolog
66 sub token_negligible_p {
67 my ($x) = @_;
68 my $t = $x->type;
69 return !$extract_all_p && (
70 $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
71 : $t == C4::TmplTokenType::DIRECTIVE() ? 1
72 : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
73 && join(
74 '',
75 map {
76 my $t = $_->type;
77 $t == C4::TmplTokenType::DIRECTIVE() ? '1'
78 : $t == C4::TmplTokenType::TAG() ? ''
79 : token_negligible_p($_) ? ''
80 : '1'
81 } @{ $x->children }
82 ) eq ''
86 ###############################################################################
88 sub remember {
89 my($token, $string) = @_;
90 # If we determine that the string is negligible, don't bother to remember
91 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
92 my $key = TmplTokenizer::string_canon( $string );
93 $text{$key} = [] unless defined $text{$key};
94 push @{$text{$key}}, $token;
98 ###############################################################################
100 sub string_list {
101 my @t = keys %text;
102 # The real gettext tools seems to sort case sensitively; I don't know why
103 @t = sort { $a cmp $b } @t if $sort eq 's';
104 @t = sort {
105 my @aa = sort { $a->pathname cmp $b->pathname
106 || $a->line_number <=> $b->line_number } @{$text{$a}};
107 my @bb = sort { $a->pathname cmp $b->pathname
108 || $a->line_number <=> $b->line_number } @{$text{$b}};
109 $aa[0]->pathname cmp $bb[0]->pathname
110 || $aa[0]->line_number <=> $bb[0]->line_number;
111 } @t if $sort eq 'F';
112 return @t;
115 ###############################################################################
117 sub text_extract {
118 my($h) = @_;
119 for (;;) {
120 my $s = TmplTokenizer::next_token($h);
121 last unless defined $s;
122 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
123 if ($kind eq C4::TmplTokenType::TEXT) {
124 if ($t =~ /\S/s && $t !~ /<!/){
125 remember( $s, $t );
127 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
128 if ($s->form =~ /\S/s && $s->form !~ /<!/){
129 remember( $s, $s->form );
131 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
132 # value [tag=input], meta
133 my $tag;
134 $tag = lc($1) if $t =~ /^<(\S+)/s;
135 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder', 'aria-label') {
136 if ($attr->{$a}) {
137 next if $a eq 'label' && $tag ne 'optgroup';
138 next if $a eq 'content' && $tag ne 'meta';
139 next if $a eq 'value' && ($tag ne 'input'
140 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
141 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
142 $val = TmplTokenizer::trim($val);
143 # for selected attributes replace '[%..%]' with '%s' globally
144 if ( $a =~ /title|value|alt|content|placeholder|aria-label/ ) {
145 $val =~ s/\[\%.*?\%\]/\%s/g;
147 # save attribute text for translation
148 remember( $s, $val ) if $val =~ /\S/s;
151 } elsif ($s->has_js_data) {
152 for my $t (@{$s->js_data}) {
153 remember( $s, $t->[3] ) if $t->[0]; # FIXME
159 ###############################################################################
161 sub generate_strings_list {
162 # Emit all extracted strings.
163 for my $t (string_list) {
164 printf $OUTPUT "%s\n", $t;
168 ###############################################################################
170 sub generate_po_file {
171 # We don't emit the Plural-Forms header; it's meaningless for us
172 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
173 $pot_charset = TmplTokenizer::charset_canon($pot_charset);
174 # Time stamps aren't exactly right semantically. I don't know how to fix it.
175 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
176 my $time_pot = $time;
177 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
178 print $OUTPUT <<EOF;
179 # SOME DESCRIPTIVE TITLE.
180 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
181 # This file is distributed under the same license as the PACKAGE package.
182 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
185 print $OUTPUT <<EOF unless $disable_fuzzy_p;
186 #, fuzzy
188 print $OUTPUT <<EOF;
189 msgid ""
190 msgstr ""
191 "Project-Id-Version: Koha\\n"
192 "POT-Creation-Date: $time_pot\\n"
193 "PO-Revision-Date: $time_po\\n"
194 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
195 "Language-Team: LANGUAGE <LL\@li.org>\\n"
196 "MIME-Version: 1.0\\n"
197 "Content-Type: text/plain; charset=$pot_charset\\n"
198 "Content-Transfer-Encoding: 8bit\\n"
201 my $directory_re = quotemeta("$directory/");
202 for my $t (string_list) {
203 if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
204 my($token, $n) = ($text{$t}->[0], 0);
205 printf $OUTPUT "#. For the first occurrence,\n"
206 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
207 for my $param ($token->parameters_and_fields) {
208 $n += 1;
209 my $type = $param->type;
210 my $subtype = ($type == C4::TmplTokenType::TAG
211 && $param->string =~ /^<input\b/is?
212 $param->attributes->{'type'}->[1]: undef);
213 my $fmt = TmplTokenizer::_formalize( $param );
214 $fmt =~ s/^%/%$n\$/;
215 if ($type == C4::TmplTokenType::DIRECTIVE) {
216 # $type = "Template::Toolkit Directive";
217 $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
218 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
219 $2: undef;
220 printf $OUTPUT "#. %s: %s\n", $fmt,
221 "$type" . (defined $name? " name=$name": '');
222 } else {
223 my $name = $param->attributes->{'name'};
224 my $value;
225 $value = $param->attributes->{'value'}
226 unless $subtype =~ /^(?:text)$/;
227 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
228 . (defined $name? " name=$name->[1]": '')
229 . (defined $value? " value=$value->[1]": '');
232 } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
233 my($token) = ($text{$t}->[0]);
234 printf $OUTPUT "#. For the first occurrence,\n"
235 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
236 if ($token->string =~ /^<meta\b/is) {
237 my $type = $token->attributes->{'http-equiv'}->[1];
238 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
239 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
240 my $tag = uc($1);
241 my $type = (lc($tag) eq 'input'?
242 $token->attributes->{'type'}: undef);
243 my $name = $token->attributes->{'name'};
244 printf $OUTPUT "#. %s\n", $tag
245 . (defined $type? " type=$type->[1]": '')
246 . (defined $name? " name=$name->[1]": '');
248 } elsif ($text{$t}->[0]->has_js_data) {
249 printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
250 printf $OUTPUT "#. SCRIPT\n";
252 my $cformat_p;
253 for my $token (@{$text{$t}}) {
254 my $pathname = $token->pathname;
255 $pathname =~ s/^$directory_re//os;
256 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
257 printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
258 if defined $pathname && defined $token->line_number;
259 $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
261 printf $OUTPUT "#, c-format\n" if $cformat_p;
262 printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po(
263 TmplTokenizer::string_canon(
264 TmplTokenizer::charset_convert($t, $charset_in, $charset_out)
267 printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
268 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
272 ###############################################################################
274 sub convert_translation_file {
275 open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n";
276 VerboseWarnings::set_input_file_name($convert_from);
277 while (<$INPUT>) {
278 chomp;
279 my($msgid, $msgstr) = split(/\t/);
280 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
281 unless defined $msgstr;
283 # Fixup some of the bad strings
284 $msgid =~ s/^SELECTED>//;
286 # Create dummy token
287 my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
288 remember( $token, $msgid );
289 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
290 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
292 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
293 my $candidate = TmplTokenizer::charset_canon($2);
294 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
295 if defined $charset_in && $charset_in ne $candidate;
296 $charset_in = $candidate;
298 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
299 my $candidate = TmplTokenizer::charset_canon($2);
300 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
301 if defined $charset_out && $charset_out ne $candidate;
302 $charset_out = $candidate;
305 # The following assumption is correct; that's what HTML::Template assumes
306 if (!defined $charset_in) {
307 $charset_in = $charset_out = TmplTokenizer::charset_canon('utf-8');
308 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
312 ###############################################################################
314 sub usage {
315 my($exitcode) = @_;
316 my $h = $exitcode? *STDERR: *STDOUT;
317 print $h <<EOF;
318 Usage: $0 [OPTIONS]
319 Extract translatable strings from given HTML::Template input files.
321 Input file location:
322 -f, --files-from=FILE Get list of input files from FILE
323 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
325 Output file location:
326 -o, --output=FILE Write output to specified file
328 HTML::Template options:
329 -a, --extract-all Extract all strings
330 --pedantic-warnings Issue warnings even for detected problems
331 which are likely to be harmless
333 Output details:
334 -s, --sort-output generate sorted output
335 -F, --sort-by-file sort output by file location
336 -v, --verbose explain what is being done
338 Informative output:
339 --help Display this help and exit
341 Try `perldoc $0' for perhaps more information.
343 exit($exitcode);
346 ###############################################################################
348 sub usage_error {
349 print STDERR "$_[0]\n" if @_;
350 print STDERR "Try `$0 --help' for more information.\n";
351 exit(-1);
354 ###############################################################################
356 Getopt::Long::config qw( bundling no_auto_abbrev );
357 GetOptions(
358 'a|extract-all' => \$extract_all_p,
359 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
360 'convert-from=s' => \$convert_from,
361 'D|directory=s' => \$directory,
362 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
363 'f|files-from=s' => \$files_from,
364 'I|input-charset=s' => \$charset_in, # INTERNAL
365 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
366 'O|output-charset=s' => \$charset_out, # INTERNAL
367 'output|o=s' => \$output,
368 'po-mode' => \$po_mode_p, # INTERNAL
369 's|sort-output' => sub { $sort = 's' },
370 'F|sort-by-file' => sub { $sort = 'F' },
371 'v|verbose' => \$verbose_p,
372 'help' => sub { usage(0) },
373 ) || usage_error;
375 VerboseWarnings::set_application_name($0);
376 VerboseWarnings::set_pedantic_mode($pedantic_p);
378 usage_error('Missing mandatory option -f')
379 unless defined $files_from || defined $convert_from;
380 $directory = '.' unless defined $directory;
382 usage_error('You cannot specify both --convert-from and --files-from')
383 if defined $convert_from && defined $files_from;
385 if (defined $output && $output ne '-') {
386 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
387 open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
388 } else {
389 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
390 open($OUTPUT, ">&STDOUT");
393 if (defined $files_from) {
394 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
395 open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n";
396 while (<$INPUT>) {
397 chomp;
398 my $input = /^\//? $_: "$directory/$_";
399 my $h = TmplTokenizer->new( $input );
400 $h->set_allow_cformat( 1 );
401 VerboseWarnings::set_input_file_name($input);
402 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
403 text_extract( $h );
405 close $INPUT;
406 } else {
407 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
408 convert_translation_file;
410 generate_po_file;
412 warn "This input will not work with Mozilla standards-compliant mode\n", undef
413 if TmplTokenizer::syntaxerror_p;
416 exit(-1) if TmplTokenizer::fatal_p;
418 ###############################################################################
420 =head1 DESCRIPTION
422 This script has behaviour similar to
423 xgettext(1), and generates gettext-compatible output files.
425 A gettext-like format provides the following advantages:
427 =over
429 =item -
431 Translation to non-English-like languages with different word
432 order: gettext's c-format strings can theoretically be
433 emulated if we are able to do some analysis on the .tt input
434 and treat <TMPL_VAR> in a way similar to %s.
436 =item -
438 Context for the extracted strings: the gettext format provides
439 the filenames and line numbers where each string can be found.
440 The translator can read the source file and see the context,
441 in case the string by itself can mean several different things.
443 =item -
445 Place for the translator to add comments about the translations.
447 =item -
449 Gettext-compatible tools, if any, might be usable if we adopt
450 the gettext format.
452 =back
454 This script has already been in use for over a year and should
455 be reasonable stable. Nevertheless, it is still somewhat
456 experimental and there are still some issues.
458 Please refer to the explanation in tmpl_process3 for further
459 details.
461 If you want to generate GNOME-style POTFILES.in files, such
462 files (passed to -f) can be generated thus:
464 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
465 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
466 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
467 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
469 This is, however, quite pointless, because the "create" and
470 "update" actions have already been implemented in tmpl_process3.pl.
472 =head2 Strings inside JavaScript
474 In the SCRIPT elements, the script will attempt to scan for
475 _("I<string literal>") patterns, and extract the I<string literal>
476 as a translatable string.
478 Note that the C-like _(...) notation is required.
480 The JavaScript must actually define a _ function
481 so that the code remains correct JavaScript.
482 A suitable definition of such a function can be
484 function _(s) { return s } // dummy function for gettext
486 =head1 SEE ALSO
488 tmpl_process3.pl,
489 xgettext(1),
490 Locale::PO(3),
491 translator_doc.txt
493 =head1 BUGS
495 There probably are some. Bugs related to scanning of <INPUT>
496 tags seem to be especially likely to be present.
498 Its diagnostics are probably too verbose.
500 When a <TMPL_VAR> within a JavaScript-related attribute is
501 detected, the script currently displays no warnings at all.
502 It might be good to display some kind of warning.
504 Its sort order (-s option) seems to be different than the real
505 xgettext(1)'s sort option. This will result in translation
506 strings inside the generated PO file spuriously moving about
507 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
509 If a Javascript string has leading spaces, it will
510 generate strings with spurious leading spaces,
511 leading to failure to match the strings when actually generating
512 translated files.
514 =cut