Bug 22495: Restore Elasticsearch mapping for su-geo
[koha.git] / misc / translator / xgettext.pl
blob0343ed29c23103a9cdaff9e538ae8395b04eb23e
1 #!/usr/bin/perl
3 =head1 NAME
5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
7 =cut
9 use FindBin;
10 use lib $FindBin::Bin;
12 use strict;
13 use warnings;
14 use Getopt::Long;
15 use POSIX;
16 use Locale::PO;
17 use TmplTokenizer;
18 use VerboseWarnings;
20 use vars qw( $convert_from );
21 use vars qw( $files_from $directory $output $sort );
22 use vars qw( $extract_all_p );
23 use vars qw( $pedantic_p );
24 use vars qw( %text %translation );
25 use vars qw( $charset_in $charset_out );
26 use vars qw( $disable_fuzzy_p );
27 use vars qw( $verbose_p );
28 use vars qw( $po_mode_p );
30 our $OUTPUT;
32 ###############################################################################
34 sub string_negligible_p {
35 my($t) = @_; # a string
36 # Don't emit pure whitespace, pure numbers, pure punctuation,
37 # single letters, or TMPL_VAR's.
38 # Punctuation should arguably be translated. But without context
39 # they are untranslatable. Note that $t is a string, not a token object.
40 return !$extract_all_p && (
41 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
42 || $t =~ /^\d+$/ # purely digits
43 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
44 || $t =~ /^[A-Za-z]$/ # single letters
45 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
46 || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities
50 sub token_negligible_p {
51 my ($x) = @_;
52 my $t = $x->type;
53 return !$extract_all_p && (
54 $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
55 : $t == C4::TmplTokenType::DIRECTIVE() ? 1
56 : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
57 && join(
58 '',
59 map {
60 my $t = $_->type;
61 $t == C4::TmplTokenType::DIRECTIVE() ? '1'
62 : $t == C4::TmplTokenType::TAG() ? ''
63 : token_negligible_p($_) ? ''
64 : '1'
65 } @{ $x->children }
66 ) eq ''
70 ###############################################################################
72 sub remember {
73 my($token, $string) = @_;
74 # If we determine that the string is negligible, don't bother to remember
75 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
76 my $key = TmplTokenizer::string_canon( $string );
77 $text{$key} = [] unless defined $text{$key};
78 push @{$text{$key}}, $token;
82 ###############################################################################
84 sub string_list {
85 my @t = keys %text;
86 # The real gettext tools seems to sort case sensitively; I don't know why
87 @t = sort { $a cmp $b } @t if $sort eq 's';
88 @t = sort {
89 my @aa = sort { $a->pathname cmp $b->pathname
90 || $a->line_number <=> $b->line_number } @{$text{$a}};
91 my @bb = sort { $a->pathname cmp $b->pathname
92 || $a->line_number <=> $b->line_number } @{$text{$b}};
93 $aa[0]->pathname cmp $bb[0]->pathname
94 || $aa[0]->line_number <=> $bb[0]->line_number;
95 } @t if $sort eq 'F';
96 return @t;
99 ###############################################################################
101 sub text_extract {
102 my($h) = @_;
103 for (;;) {
104 my $s = TmplTokenizer::next_token $h;
105 last unless defined $s;
106 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
107 if ($kind eq C4::TmplTokenType::TEXT) {
108 if ($t =~ /\S/s && $t !~ /<!/){
109 remember( $s, $t );
111 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
112 if ($s->form =~ /\S/s && $s->form !~ /<!/){
113 remember( $s, $s->form );
115 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
116 # value [tag=input], meta
117 my $tag;
118 $tag = lc($1) if $t =~ /^<(\S+)/s;
119 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
120 if ($attr->{$a}) {
121 next if $a eq 'label' && $tag ne 'optgroup';
122 next if $a eq 'content' && $tag ne 'meta';
123 next if $a eq 'value' && ($tag ne 'input'
124 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
125 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
126 $val = TmplTokenizer::trim $val;
127 remember( $s, $val ) if $val =~ /\S/s;
130 } elsif ($s->has_js_data) {
131 for my $t (@{$s->js_data}) {
132 remember( $s, $t->[3] ) if $t->[0]; # FIXME
138 ###############################################################################
140 sub generate_strings_list {
141 # Emit all extracted strings.
142 for my $t (string_list) {
143 printf $OUTPUT "%s\n", $t;
147 ###############################################################################
149 sub generate_po_file {
150 # We don't emit the Plural-Forms header; it's meaningless for us
151 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
152 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
153 # Time stamps aren't exactly right semantically. I don't know how to fix it.
154 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
155 my $time_pot = $time;
156 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
157 print $OUTPUT <<EOF;
158 # SOME DESCRIPTIVE TITLE.
159 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
160 # This file is distributed under the same license as the PACKAGE package.
161 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
164 print $OUTPUT <<EOF unless $disable_fuzzy_p;
165 #, fuzzy
167 print $OUTPUT <<EOF;
168 msgid ""
169 msgstr ""
170 "Project-Id-Version: PACKAGE VERSION\\n"
171 "POT-Creation-Date: $time_pot\\n"
172 "PO-Revision-Date: $time_po\\n"
173 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
174 "Language-Team: LANGUAGE <LL\@li.org>\\n"
175 "MIME-Version: 1.0\\n"
176 "Content-Type: text/plain; charset=$pot_charset\\n"
177 "Content-Transfer-Encoding: 8bit\\n"
180 my $directory_re = quotemeta("$directory/");
181 for my $t (string_list) {
182 if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
183 my($token, $n) = ($text{$t}->[0], 0);
184 printf $OUTPUT "#. For the first occurrence,\n"
185 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
186 for my $param ($token->parameters_and_fields) {
187 $n += 1;
188 my $type = $param->type;
189 my $subtype = ($type == C4::TmplTokenType::TAG
190 && $param->string =~ /^<input\b/is?
191 $param->attributes->{'type'}->[1]: undef);
192 my $fmt = TmplTokenizer::_formalize( $param );
193 $fmt =~ s/^%/%$n\$/;
194 if ($type == C4::TmplTokenType::DIRECTIVE) {
195 # $type = "Template::Toolkit Directive";
196 $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
197 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
198 $2: undef;
199 printf $OUTPUT "#. %s: %s\n", $fmt,
200 "$type" . (defined $name? " name=$name": '');
201 } else {
202 my $name = $param->attributes->{'name'};
203 my $value;
204 $value = $param->attributes->{'value'}
205 unless $subtype =~ /^(?:text)$/;
206 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
207 . (defined $name? " name=$name->[1]": '')
208 . (defined $value? " value=$value->[1]": '');
211 } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
212 my($token) = ($text{$t}->[0]);
213 printf $OUTPUT "#. For the first occurrence,\n"
214 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
215 if ($token->string =~ /^<meta\b/is) {
216 my $type = $token->attributes->{'http-equiv'}->[1];
217 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
218 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
219 my $tag = uc($1);
220 my $type = (lc($tag) eq 'input'?
221 $token->attributes->{'type'}: undef);
222 my $name = $token->attributes->{'name'};
223 printf $OUTPUT "#. %s\n", $tag
224 . (defined $type? " type=$type->[1]": '')
225 . (defined $name? " name=$name->[1]": '');
227 } elsif ($text{$t}->[0]->has_js_data) {
228 printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
229 printf $OUTPUT "#. SCRIPT\n";
231 my $cformat_p;
232 for my $token (@{$text{$t}}) {
233 my $pathname = $token->pathname;
234 $pathname =~ s/^$directory_re//os;
235 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
236 printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
237 if defined $pathname && defined $token->line_number;
238 $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
240 printf $OUTPUT "#, c-format\n" if $cformat_p;
241 printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po
242 TmplTokenizer::string_canon
243 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
244 printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
245 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
249 ###############################################################################
251 sub convert_translation_file {
252 open(my $INPUT, '<', $convert_from) || die "$convert_from: $!\n";
253 VerboseWarnings::set_input_file_name $convert_from;
254 while (<$INPUT>) {
255 chomp;
256 my($msgid, $msgstr) = split(/\t/);
257 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
258 unless defined $msgstr;
260 # Fixup some of the bad strings
261 $msgid =~ s/^SELECTED>//;
263 # Create dummy token
264 my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
265 remember( $token, $msgid );
266 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
267 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
269 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
270 my $candidate = TmplTokenizer::charset_canon $2;
271 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
272 if defined $charset_in && $charset_in ne $candidate;
273 $charset_in = $candidate;
275 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
276 my $candidate = TmplTokenizer::charset_canon $2;
277 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
278 if defined $charset_out && $charset_out ne $candidate;
279 $charset_out = $candidate;
282 # The following assumption is correct; that's what HTML::Template assumes
283 if (!defined $charset_in) {
284 $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
285 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
289 ###############################################################################
291 sub usage {
292 my($exitcode) = @_;
293 my $h = $exitcode? *STDERR: *STDOUT;
294 print $h <<EOF;
295 Usage: $0 [OPTIONS]
296 Extract translatable strings from given HTML::Template input files.
298 Input file location:
299 -f, --files-from=FILE Get list of input files from FILE
300 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
302 Output file location:
303 -o, --output=FILE Write output to specified file
305 HTML::Template options:
306 -a, --extract-all Extract all strings
307 --pedantic-warnings Issue warnings even for detected problems
308 which are likely to be harmless
310 Output details:
311 -s, --sort-output generate sorted output
312 -F, --sort-by-file sort output by file location
313 -v, --verbose explain what is being done
315 Informative output:
316 --help Display this help and exit
318 Try `perldoc $0' for perhaps more information.
320 exit($exitcode);
323 ###############################################################################
325 sub usage_error {
326 print STDERR "$_[0]\n" if @_;
327 print STDERR "Try `$0 --help' for more information.\n";
328 exit(-1);
331 ###############################################################################
333 Getopt::Long::config qw( bundling no_auto_abbrev );
334 GetOptions(
335 'a|extract-all' => \$extract_all_p,
336 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
337 'convert-from=s' => \$convert_from,
338 'D|directory=s' => \$directory,
339 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
340 'f|files-from=s' => \$files_from,
341 'I|input-charset=s' => \$charset_in, # INTERNAL
342 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
343 'O|output-charset=s' => \$charset_out, # INTERNAL
344 'output|o=s' => \$output,
345 'po-mode' => \$po_mode_p, # INTERNAL
346 's|sort-output' => sub { $sort = 's' },
347 'F|sort-by-file' => sub { $sort = 'F' },
348 'v|verbose' => \$verbose_p,
349 'help' => sub { usage(0) },
350 ) || usage_error;
352 VerboseWarnings::set_application_name $0;
353 VerboseWarnings::set_pedantic_mode $pedantic_p;
355 usage_error('Missing mandatory option -f')
356 unless defined $files_from || defined $convert_from;
357 $directory = '.' unless defined $directory;
359 usage_error('You cannot specify both --convert-from and --files-from')
360 if defined $convert_from && defined $files_from;
362 if (defined $output && $output ne '-') {
363 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
364 open($OUTPUT, '>', $output) || die "$output: $!\n";
365 } else {
366 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
367 open($OUTPUT, ">&STDOUT");
370 if (defined $files_from) {
371 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
372 open(my $INPUT, '<', $files_from) || die "$files_from: $!\n";
373 while (<$INPUT>) {
374 chomp;
375 my $input = /^\//? $_: "$directory/$_";
376 my $h = TmplTokenizer->new( $input );
377 $h->set_allow_cformat( 1 );
378 VerboseWarnings::set_input_file_name $input;
379 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
380 text_extract( $h );
382 close $INPUT;
383 } else {
384 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
385 convert_translation_file;
387 generate_po_file;
389 warn "This input will not work with Mozilla standards-compliant mode\n", undef
390 if TmplTokenizer::syntaxerror_p;
393 exit(-1) if TmplTokenizer::fatal_p;
395 ###############################################################################
397 =head1 DESCRIPTION
399 This script has behaviour similar to
400 xgettext(1), and generates gettext-compatible output files.
402 A gettext-like format provides the following advantages:
404 =over
406 =item -
408 Translation to non-English-like languages with different word
409 order: gettext's c-format strings can theoretically be
410 emulated if we are able to do some analysis on the .tt input
411 and treat <TMPL_VAR> in a way similar to %s.
413 =item -
415 Context for the extracted strings: the gettext format provides
416 the filenames and line numbers where each string can be found.
417 The translator can read the source file and see the context,
418 in case the string by itself can mean several different things.
420 =item -
422 Place for the translator to add comments about the translations.
424 =item -
426 Gettext-compatible tools, if any, might be usable if we adopt
427 the gettext format.
429 =back
431 This script has already been in use for over a year and should
432 be reasonable stable. Nevertheless, it is still somewhat
433 experimental and there are still some issues.
435 Please refer to the explanation in tmpl_process3 for further
436 details.
438 If you want to generate GNOME-style POTFILES.in files, such
439 files (passed to -f) can be generated thus:
441 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
442 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
443 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
444 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
446 This is, however, quite pointless, because the "create" and
447 "update" actions have already been implemented in tmpl_process3.pl.
449 =head2 Strings inside JavaScript
451 In the SCRIPT elements, the script will attempt to scan for
452 _("I<string literal>") patterns, and extract the I<string literal>
453 as a translatable string.
455 Note that the C-like _(...) notation is required.
457 The JavaScript must actually define a _ function
458 so that the code remains correct JavaScript.
459 A suitable definition of such a function can be
461 function _(s) { return s } // dummy function for gettext
463 =head1 SEE ALSO
465 tmpl_process3.pl,
466 xgettext(1),
467 Locale::PO(3),
468 translator_doc.txt
470 =head1 BUGS
472 There probably are some. Bugs related to scanning of <INPUT>
473 tags seem to be especially likely to be present.
475 Its diagnostics are probably too verbose.
477 When a <TMPL_VAR> within a JavaScript-related attribute is
478 detected, the script currently displays no warnings at all.
479 It might be good to display some kind of warning.
481 Its sort order (-s option) seems to be different than the real
482 xgettext(1)'s sort option. This will result in translation
483 strings inside the generated PO file spuriously moving about
484 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
486 If a Javascript string has leading spaces, it will
487 generate strings with spurious leading spaces,
488 leading to failure to match the strings when actually generating
489 translated files.
491 =cut