Bug 25922: Fix typo 'arial-label' -> 'aria-label'
[koha.git] / misc / translator / xgettext.pl
blobf9ba3bf8d029a7cdee5d42ddba2a8b32e8bea694
1 #!/usr/bin/perl
3 =head1 NAME
5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
7 =cut
9 use FindBin;
10 use lib $FindBin::Bin;
12 use strict;
13 use warnings;
14 use Getopt::Long;
15 use POSIX;
16 use Locale::PO;
17 use TmplTokenizer;
18 use VerboseWarnings;
20 use vars qw( $convert_from );
21 use vars qw( $files_from $directory $output $sort );
22 use vars qw( $extract_all_p );
23 use vars qw( $pedantic_p );
24 use vars qw( %text %translation );
25 use vars qw( $charset_in $charset_out );
26 use vars qw( $disable_fuzzy_p );
27 use vars qw( $verbose_p );
28 use vars qw( $po_mode_p );
30 our $OUTPUT;
32 ###############################################################################
34 sub string_negligible_p {
35 my($t) = @_; # a string
36 # Don't emit pure whitespace, pure numbers, pure punctuation,
37 # single letters, or TMPL_VAR's.
38 # Punctuation should arguably be translated. But without context
39 # they are untranslatable. Note that $t is a string, not a token object.
40 return !$extract_all_p && (
41 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
42 || $t =~ /^\d+$/ # purely digits
43 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
44 || $t =~ /^[A-Za-z]$/ # single letters
45 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
46 || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities
47 || $t =~ /^\s*<\?.*\?>/ # ignore xml prolog
51 sub token_negligible_p {
52 my ($x) = @_;
53 my $t = $x->type;
54 return !$extract_all_p && (
55 $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
56 : $t == C4::TmplTokenType::DIRECTIVE() ? 1
57 : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
58 && join(
59 '',
60 map {
61 my $t = $_->type;
62 $t == C4::TmplTokenType::DIRECTIVE() ? '1'
63 : $t == C4::TmplTokenType::TAG() ? ''
64 : token_negligible_p($_) ? ''
65 : '1'
66 } @{ $x->children }
67 ) eq ''
71 ###############################################################################
73 sub remember {
74 my($token, $string) = @_;
75 # If we determine that the string is negligible, don't bother to remember
76 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
77 my $key = TmplTokenizer::string_canon( $string );
78 $text{$key} = [] unless defined $text{$key};
79 push @{$text{$key}}, $token;
83 ###############################################################################
85 sub string_list {
86 my @t = keys %text;
87 # The real gettext tools seems to sort case sensitively; I don't know why
88 @t = sort { $a cmp $b } @t if $sort eq 's';
89 @t = sort {
90 my @aa = sort { $a->pathname cmp $b->pathname
91 || $a->line_number <=> $b->line_number } @{$text{$a}};
92 my @bb = sort { $a->pathname cmp $b->pathname
93 || $a->line_number <=> $b->line_number } @{$text{$b}};
94 $aa[0]->pathname cmp $bb[0]->pathname
95 || $aa[0]->line_number <=> $bb[0]->line_number;
96 } @t if $sort eq 'F';
97 return @t;
100 ###############################################################################
102 sub text_extract {
103 my($h) = @_;
104 for (;;) {
105 my $s = TmplTokenizer::next_token($h);
106 last unless defined $s;
107 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
108 if ($kind eq C4::TmplTokenType::TEXT) {
109 if ($t =~ /\S/s && $t !~ /<!/){
110 remember( $s, $t );
112 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
113 if ($s->form =~ /\S/s && $s->form !~ /<!/){
114 remember( $s, $s->form );
116 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
117 # value [tag=input], meta
118 my $tag;
119 $tag = lc($1) if $t =~ /^<(\S+)/s;
120 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder', 'aria-label') {
121 if ($attr->{$a}) {
122 next if $a eq 'label' && $tag ne 'optgroup';
123 next if $a eq 'content' && $tag ne 'meta';
124 next if $a eq 'value' && ($tag ne 'input'
125 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
126 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
127 $val = TmplTokenizer::trim($val);
128 # for selected attributes replace '[%..%]' with '%s' globally
129 if ( $a =~ /title|value|alt|content|placeholder|aria-label/ ) {
130 $val =~ s/\[\%.*?\%\]/\%s/g;
132 # save attribute text for translation
133 remember( $s, $val ) if $val =~ /\S/s;
136 } elsif ($s->has_js_data) {
137 for my $t (@{$s->js_data}) {
138 remember( $s, $t->[3] ) if $t->[0]; # FIXME
144 ###############################################################################
146 sub generate_strings_list {
147 # Emit all extracted strings.
148 for my $t (string_list) {
149 printf $OUTPUT "%s\n", $t;
153 ###############################################################################
155 sub generate_po_file {
156 # We don't emit the Plural-Forms header; it's meaningless for us
157 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
158 $pot_charset = TmplTokenizer::charset_canon($pot_charset);
159 # Time stamps aren't exactly right semantically. I don't know how to fix it.
160 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
161 my $time_pot = $time;
162 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
163 print $OUTPUT <<EOF;
164 # SOME DESCRIPTIVE TITLE.
165 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
166 # This file is distributed under the same license as the PACKAGE package.
167 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
170 print $OUTPUT <<EOF unless $disable_fuzzy_p;
171 #, fuzzy
173 print $OUTPUT <<EOF;
174 msgid ""
175 msgstr ""
176 "Project-Id-Version: PACKAGE VERSION\\n"
177 "POT-Creation-Date: $time_pot\\n"
178 "PO-Revision-Date: $time_po\\n"
179 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
180 "Language-Team: LANGUAGE <LL\@li.org>\\n"
181 "MIME-Version: 1.0\\n"
182 "Content-Type: text/plain; charset=$pot_charset\\n"
183 "Content-Transfer-Encoding: 8bit\\n"
186 my $directory_re = quotemeta("$directory/");
187 for my $t (string_list) {
188 if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
189 my($token, $n) = ($text{$t}->[0], 0);
190 printf $OUTPUT "#. For the first occurrence,\n"
191 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
192 for my $param ($token->parameters_and_fields) {
193 $n += 1;
194 my $type = $param->type;
195 my $subtype = ($type == C4::TmplTokenType::TAG
196 && $param->string =~ /^<input\b/is?
197 $param->attributes->{'type'}->[1]: undef);
198 my $fmt = TmplTokenizer::_formalize( $param );
199 $fmt =~ s/^%/%$n\$/;
200 if ($type == C4::TmplTokenType::DIRECTIVE) {
201 # $type = "Template::Toolkit Directive";
202 $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
203 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
204 $2: undef;
205 printf $OUTPUT "#. %s: %s\n", $fmt,
206 "$type" . (defined $name? " name=$name": '');
207 } else {
208 my $name = $param->attributes->{'name'};
209 my $value;
210 $value = $param->attributes->{'value'}
211 unless $subtype =~ /^(?:text)$/;
212 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
213 . (defined $name? " name=$name->[1]": '')
214 . (defined $value? " value=$value->[1]": '');
217 } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
218 my($token) = ($text{$t}->[0]);
219 printf $OUTPUT "#. For the first occurrence,\n"
220 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
221 if ($token->string =~ /^<meta\b/is) {
222 my $type = $token->attributes->{'http-equiv'}->[1];
223 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
224 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
225 my $tag = uc($1);
226 my $type = (lc($tag) eq 'input'?
227 $token->attributes->{'type'}: undef);
228 my $name = $token->attributes->{'name'};
229 printf $OUTPUT "#. %s\n", $tag
230 . (defined $type? " type=$type->[1]": '')
231 . (defined $name? " name=$name->[1]": '');
233 } elsif ($text{$t}->[0]->has_js_data) {
234 printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
235 printf $OUTPUT "#. SCRIPT\n";
237 my $cformat_p;
238 for my $token (@{$text{$t}}) {
239 my $pathname = $token->pathname;
240 $pathname =~ s/^$directory_re//os;
241 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
242 printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
243 if defined $pathname && defined $token->line_number;
244 $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
246 printf $OUTPUT "#, c-format\n" if $cformat_p;
247 printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po(
248 TmplTokenizer::string_canon(
249 TmplTokenizer::charset_convert($t, $charset_in, $charset_out)
252 printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
253 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
257 ###############################################################################
259 sub convert_translation_file {
260 open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n";
261 VerboseWarnings::set_input_file_name($convert_from);
262 while (<$INPUT>) {
263 chomp;
264 my($msgid, $msgstr) = split(/\t/);
265 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
266 unless defined $msgstr;
268 # Fixup some of the bad strings
269 $msgid =~ s/^SELECTED>//;
271 # Create dummy token
272 my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
273 remember( $token, $msgid );
274 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
275 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
277 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
278 my $candidate = TmplTokenizer::charset_canon($2);
279 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
280 if defined $charset_in && $charset_in ne $candidate;
281 $charset_in = $candidate;
283 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
284 my $candidate = TmplTokenizer::charset_canon($2);
285 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
286 if defined $charset_out && $charset_out ne $candidate;
287 $charset_out = $candidate;
290 # The following assumption is correct; that's what HTML::Template assumes
291 if (!defined $charset_in) {
292 $charset_in = $charset_out = TmplTokenizer::charset_canon('utf-8');
293 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
297 ###############################################################################
299 sub usage {
300 my($exitcode) = @_;
301 my $h = $exitcode? *STDERR: *STDOUT;
302 print $h <<EOF;
303 Usage: $0 [OPTIONS]
304 Extract translatable strings from given HTML::Template input files.
306 Input file location:
307 -f, --files-from=FILE Get list of input files from FILE
308 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
310 Output file location:
311 -o, --output=FILE Write output to specified file
313 HTML::Template options:
314 -a, --extract-all Extract all strings
315 --pedantic-warnings Issue warnings even for detected problems
316 which are likely to be harmless
318 Output details:
319 -s, --sort-output generate sorted output
320 -F, --sort-by-file sort output by file location
321 -v, --verbose explain what is being done
323 Informative output:
324 --help Display this help and exit
326 Try `perldoc $0' for perhaps more information.
328 exit($exitcode);
331 ###############################################################################
333 sub usage_error {
334 print STDERR "$_[0]\n" if @_;
335 print STDERR "Try `$0 --help' for more information.\n";
336 exit(-1);
339 ###############################################################################
341 Getopt::Long::config qw( bundling no_auto_abbrev );
342 GetOptions(
343 'a|extract-all' => \$extract_all_p,
344 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
345 'convert-from=s' => \$convert_from,
346 'D|directory=s' => \$directory,
347 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
348 'f|files-from=s' => \$files_from,
349 'I|input-charset=s' => \$charset_in, # INTERNAL
350 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
351 'O|output-charset=s' => \$charset_out, # INTERNAL
352 'output|o=s' => \$output,
353 'po-mode' => \$po_mode_p, # INTERNAL
354 's|sort-output' => sub { $sort = 's' },
355 'F|sort-by-file' => sub { $sort = 'F' },
356 'v|verbose' => \$verbose_p,
357 'help' => sub { usage(0) },
358 ) || usage_error;
360 VerboseWarnings::set_application_name($0);
361 VerboseWarnings::set_pedantic_mode($pedantic_p);
363 usage_error('Missing mandatory option -f')
364 unless defined $files_from || defined $convert_from;
365 $directory = '.' unless defined $directory;
367 usage_error('You cannot specify both --convert-from and --files-from')
368 if defined $convert_from && defined $files_from;
370 if (defined $output && $output ne '-') {
371 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
372 open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
373 } else {
374 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
375 open($OUTPUT, ">&STDOUT");
378 if (defined $files_from) {
379 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
380 open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n";
381 while (<$INPUT>) {
382 chomp;
383 my $input = /^\//? $_: "$directory/$_";
384 my $h = TmplTokenizer->new( $input );
385 $h->set_allow_cformat( 1 );
386 VerboseWarnings::set_input_file_name($input);
387 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
388 text_extract( $h );
390 close $INPUT;
391 } else {
392 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
393 convert_translation_file;
395 generate_po_file;
397 warn "This input will not work with Mozilla standards-compliant mode\n", undef
398 if TmplTokenizer::syntaxerror_p;
401 exit(-1) if TmplTokenizer::fatal_p;
403 ###############################################################################
405 =head1 DESCRIPTION
407 This script has behaviour similar to
408 xgettext(1), and generates gettext-compatible output files.
410 A gettext-like format provides the following advantages:
412 =over
414 =item -
416 Translation to non-English-like languages with different word
417 order: gettext's c-format strings can theoretically be
418 emulated if we are able to do some analysis on the .tt input
419 and treat <TMPL_VAR> in a way similar to %s.
421 =item -
423 Context for the extracted strings: the gettext format provides
424 the filenames and line numbers where each string can be found.
425 The translator can read the source file and see the context,
426 in case the string by itself can mean several different things.
428 =item -
430 Place for the translator to add comments about the translations.
432 =item -
434 Gettext-compatible tools, if any, might be usable if we adopt
435 the gettext format.
437 =back
439 This script has already been in use for over a year and should
440 be reasonable stable. Nevertheless, it is still somewhat
441 experimental and there are still some issues.
443 Please refer to the explanation in tmpl_process3 for further
444 details.
446 If you want to generate GNOME-style POTFILES.in files, such
447 files (passed to -f) can be generated thus:
449 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
450 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
451 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
452 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
454 This is, however, quite pointless, because the "create" and
455 "update" actions have already been implemented in tmpl_process3.pl.
457 =head2 Strings inside JavaScript
459 In the SCRIPT elements, the script will attempt to scan for
460 _("I<string literal>") patterns, and extract the I<string literal>
461 as a translatable string.
463 Note that the C-like _(...) notation is required.
465 The JavaScript must actually define a _ function
466 so that the code remains correct JavaScript.
467 A suitable definition of such a function can be
469 function _(s) { return s } // dummy function for gettext
471 =head1 SEE ALSO
473 tmpl_process3.pl,
474 xgettext(1),
475 Locale::PO(3),
476 translator_doc.txt
478 =head1 BUGS
480 There probably are some. Bugs related to scanning of <INPUT>
481 tags seem to be especially likely to be present.
483 Its diagnostics are probably too verbose.
485 When a <TMPL_VAR> within a JavaScript-related attribute is
486 detected, the script currently displays no warnings at all.
487 It might be good to display some kind of warning.
489 Its sort order (-s option) seems to be different than the real
490 xgettext(1)'s sort option. This will result in translation
491 strings inside the generated PO file spuriously moving about
492 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
494 If a Javascript string has leading spaces, it will
495 generate strings with spurious leading spaces,
496 leading to failure to match the strings when actually generating
497 translated files.
499 =cut