Translation updates for Koha 3.22.0-beta release
[koha.git] / misc / translator / xgettext.pl
blob504dc9f389ee39aa515175eb7ce212cf3d891bcc
1 #!/usr/bin/perl
3 =head1 NAME
5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
7 =cut
9 use strict;
10 use warnings;
11 use Getopt::Long;
12 use POSIX;
13 use Locale::PO;
14 use TmplTokenizer;
15 use VerboseWarnings;
17 use vars qw( $convert_from );
18 use vars qw( $files_from $directory $output $sort );
19 use vars qw( $extract_all_p );
20 use vars qw( $pedantic_p );
21 use vars qw( %text %translation );
22 use vars qw( $charset_in $charset_out );
23 use vars qw( $disable_fuzzy_p );
24 use vars qw( $verbose_p );
25 use vars qw( $po_mode_p );
27 our $OUTPUT;
29 ###############################################################################
31 sub string_negligible_p {
32 my($t) = @_; # a string
33 # Don't emit pure whitespace, pure numbers, pure punctuation,
34 # single letters, or TMPL_VAR's.
35 # Punctuation should arguably be translated. But without context
36 # they are untranslatable. Note that $t is a string, not a token object.
37 return !$extract_all_p && (
38 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
39 || $t =~ /^\d+$/ # purely digits
40 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
41 || $t =~ /^[A-Za-z]$/ # single letters
42 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
43 || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities
47 sub token_negligible_p {
48 my($x) = @_;
49 my $t = $x->type;
50 return !$extract_all_p && (
51 $t == C4::TmplTokenType::TEXT? string_negligible_p( $x->string ):
52 $t == C4::TmplTokenType::DIRECTIVE? 1:
53 $t == C4::TmplTokenType::TEXT_PARAMETRIZED
54 && join( '', map { my $t = $_->type;
55 $t == C4::TmplTokenType::DIRECTIVE?
56 '1': $t == C4::TmplTokenType::TAG?
57 '': token_negligible_p( $_ )?
58 '': '1' } @{$x->children} ) eq '' );
61 ###############################################################################
63 sub remember {
64 my($token, $string) = @_;
65 # If we determine that the string is negligible, don't bother to remember
66 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
67 my $key = TmplTokenizer::string_canon( $string );
68 $text{$key} = [] unless defined $text{$key};
69 push @{$text{$key}}, $token;
73 ###############################################################################
75 sub string_list {
76 my @t = keys %text;
77 # The real gettext tools seems to sort case sensitively; I don't know why
78 @t = sort { $a cmp $b } @t if $sort eq 's';
79 @t = sort {
80 my @aa = sort { $a->pathname cmp $b->pathname
81 || $a->line_number <=> $b->line_number } @{$text{$a}};
82 my @bb = sort { $a->pathname cmp $b->pathname
83 || $a->line_number <=> $b->line_number } @{$text{$b}};
84 $aa[0]->pathname cmp $bb[0]->pathname
85 || $aa[0]->line_number <=> $bb[0]->line_number;
86 } @t if $sort eq 'F';
87 return @t;
90 ###############################################################################
92 sub text_extract {
93 my($h) = @_;
94 for (;;) {
95 my $s = TmplTokenizer::next_token $h;
96 last unless defined $s;
97 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
98 if ($kind eq C4::TmplTokenType::TEXT) {
99 if ($t =~ /\S/s && $t !~ /<!/){
100 remember( $s, $t );
102 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
103 if ($s->form =~ /\S/s && $s->form !~ /<!/){
104 remember( $s, $s->form );
106 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
107 # value [tag=input], meta
108 my $tag;
109 $tag = lc($1) if $t =~ /^<(\S+)/s;
110 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
111 if ($attr->{$a}) {
112 next if $a eq 'label' && $tag ne 'optgroup';
113 next if $a eq 'content' && $tag ne 'meta';
114 next if $a eq 'value' && ($tag ne 'input'
115 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
116 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
117 $val = TmplTokenizer::trim $val;
118 remember( $s, $val ) if $val =~ /\S/s;
121 } elsif ($s->has_js_data) {
122 for my $t (@{$s->js_data}) {
123 remember( $s, $t->[3] ) if $t->[0]; # FIXME
129 ###############################################################################
131 sub generate_strings_list {
132 # Emit all extracted strings.
133 for my $t (string_list) {
134 printf $OUTPUT "%s\n", $t;
138 ###############################################################################
140 sub generate_po_file {
141 # We don't emit the Plural-Forms header; it's meaningless for us
142 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
143 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
144 # Time stamps aren't exactly right semantically. I don't know how to fix it.
145 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
146 my $time_pot = $time;
147 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
148 print $OUTPUT <<EOF;
149 # SOME DESCRIPTIVE TITLE.
150 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
151 # This file is distributed under the same license as the PACKAGE package.
152 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
155 print $OUTPUT <<EOF unless $disable_fuzzy_p;
156 #, fuzzy
158 print $OUTPUT <<EOF;
159 msgid ""
160 msgstr ""
161 "Project-Id-Version: PACKAGE VERSION\\n"
162 "POT-Creation-Date: $time_pot\\n"
163 "PO-Revision-Date: $time_po\\n"
164 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
165 "Language-Team: LANGUAGE <LL\@li.org>\\n"
166 "MIME-Version: 1.0\\n"
167 "Content-Type: text/plain; charset=$pot_charset\\n"
168 "Content-Transfer-Encoding: 8bit\\n"
171 my $directory_re = quotemeta("$directory/");
172 for my $t (string_list) {
173 if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
174 my($token, $n) = ($text{$t}->[0], 0);
175 printf $OUTPUT "#. For the first occurrence,\n"
176 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
177 for my $param ($token->parameters_and_fields) {
178 $n += 1;
179 my $type = $param->type;
180 my $subtype = ($type == C4::TmplTokenType::TAG
181 && $param->string =~ /^<input\b/is?
182 $param->attributes->{'type'}->[1]: undef);
183 my $fmt = TmplTokenizer::_formalize( $param );
184 $fmt =~ s/^%/%$n\$/;
185 if ($type == C4::TmplTokenType::DIRECTIVE) {
186 # $type = "Template::Toolkit Directive";
187 $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
188 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
189 $2: undef;
190 printf $OUTPUT "#. %s: %s\n", $fmt,
191 "$type" . (defined $name? " name=$name": '');
192 } else {
193 my $name = $param->attributes->{'name'};
194 my $value;
195 $value = $param->attributes->{'value'}
196 unless $subtype =~ /^(?:text)$/;
197 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
198 . (defined $name? " name=$name->[1]": '')
199 . (defined $value? " value=$value->[1]": '');
202 } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
203 my($token) = ($text{$t}->[0]);
204 printf $OUTPUT "#. For the first occurrence,\n"
205 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
206 if ($token->string =~ /^<meta\b/is) {
207 my $type = $token->attributes->{'http-equiv'}->[1];
208 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
209 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
210 my $tag = uc($1);
211 my $type = (lc($tag) eq 'input'?
212 $token->attributes->{'type'}: undef);
213 my $name = $token->attributes->{'name'};
214 printf $OUTPUT "#. %s\n", $tag
215 . (defined $type? " type=$type->[1]": '')
216 . (defined $name? " name=$name->[1]": '');
218 } elsif ($text{$t}->[0]->has_js_data) {
219 printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
220 printf $OUTPUT "#. SCRIPT\n";
222 my $cformat_p;
223 for my $token (@{$text{$t}}) {
224 my $pathname = $token->pathname;
225 $pathname =~ s/^$directory_re//os;
226 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
227 printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
228 if defined $pathname && defined $token->line_number;
229 $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
231 printf $OUTPUT "#, c-format\n" if $cformat_p;
232 printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po
233 TmplTokenizer::string_canon
234 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
235 printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
236 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
240 ###############################################################################
242 sub convert_translation_file {
243 open(my $INPUT, '<', $convert_from) || die "$convert_from: $!\n";
244 VerboseWarnings::set_input_file_name $convert_from;
245 while (<$INPUT>) {
246 chomp;
247 my($msgid, $msgstr) = split(/\t/);
248 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
249 unless defined $msgstr;
251 # Fixup some of the bad strings
252 $msgid =~ s/^SELECTED>//;
254 # Create dummy token
255 my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
256 remember( $token, $msgid );
257 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
258 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
260 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
261 my $candidate = TmplTokenizer::charset_canon $2;
262 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
263 if defined $charset_in && $charset_in ne $candidate;
264 $charset_in = $candidate;
266 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
267 my $candidate = TmplTokenizer::charset_canon $2;
268 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
269 if defined $charset_out && $charset_out ne $candidate;
270 $charset_out = $candidate;
273 # The following assumption is correct; that's what HTML::Template assumes
274 if (!defined $charset_in) {
275 $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
276 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
280 ###############################################################################
282 sub usage {
283 my($exitcode) = @_;
284 my $h = $exitcode? *STDERR: *STDOUT;
285 print $h <<EOF;
286 Usage: $0 [OPTIONS]
287 Extract translatable strings from given HTML::Template input files.
289 Input file location:
290 -f, --files-from=FILE Get list of input files from FILE
291 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
293 Output file location:
294 -o, --output=FILE Write output to specified file
296 HTML::Template options:
297 -a, --extract-all Extract all strings
298 --pedantic-warnings Issue warnings even for detected problems
299 which are likely to be harmless
301 Output details:
302 -s, --sort-output generate sorted output
303 -F, --sort-by-file sort output by file location
304 -v, --verbose explain what is being done
306 Informative output:
307 --help Display this help and exit
309 Try `perldoc $0' for perhaps more information.
311 exit($exitcode);
314 ###############################################################################
316 sub usage_error {
317 print STDERR "$_[0]\n" if @_;
318 print STDERR "Try `$0 --help' for more information.\n";
319 exit(-1);
322 ###############################################################################
324 Getopt::Long::config qw( bundling no_auto_abbrev );
325 GetOptions(
326 'a|extract-all' => \$extract_all_p,
327 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
328 'convert-from=s' => \$convert_from,
329 'D|directory=s' => \$directory,
330 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
331 'f|files-from=s' => \$files_from,
332 'I|input-charset=s' => \$charset_in, # INTERNAL
333 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
334 'O|output-charset=s' => \$charset_out, # INTERNAL
335 'output|o=s' => \$output,
336 'po-mode' => \$po_mode_p, # INTERNAL
337 's|sort-output' => sub { $sort = 's' },
338 'F|sort-by-file' => sub { $sort = 'F' },
339 'v|verbose' => \$verbose_p,
340 'help' => sub { usage(0) },
341 ) || usage_error;
343 VerboseWarnings::set_application_name $0;
344 VerboseWarnings::set_pedantic_mode $pedantic_p;
346 usage_error('Missing mandatory option -f')
347 unless defined $files_from || defined $convert_from;
348 $directory = '.' unless defined $directory;
350 usage_error('You cannot specify both --convert-from and --files-from')
351 if defined $convert_from && defined $files_from;
353 if (defined $output && $output ne '-') {
354 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
355 open($OUTPUT, '>', $output) || die "$output: $!\n";
356 } else {
357 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
358 open($OUTPUT, ">&STDOUT");
361 if (defined $files_from) {
362 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
363 open(my $INPUT, '<', $files_from) || die "$files_from: $!\n";
364 while (<$INPUT>) {
365 chomp;
366 my $input = /^\//? $_: "$directory/$_";
367 my $h = TmplTokenizer->new( $input );
368 $h->set_allow_cformat( 1 );
369 VerboseWarnings::set_input_file_name $input;
370 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
371 text_extract( $h );
373 close $INPUT;
374 } else {
375 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
376 convert_translation_file;
378 generate_po_file;
380 warn "This input will not work with Mozilla standards-compliant mode\n", undef
381 if TmplTokenizer::syntaxerror_p;
384 exit(-1) if TmplTokenizer::fatal_p;
386 ###############################################################################
388 =head1 DESCRIPTION
390 This is an experimental script based on the modularized
391 text-extract2.pl script. It has behaviour similar to
392 xgettext(1), and generates gettext-compatible output files.
394 A gettext-like format provides the following advantages:
396 =over
398 =item -
400 Translation to non-English-like languages with different word
401 order: gettext's c-format strings can theoretically be
402 emulated if we are able to do some analysis on the .tt input
403 and treat <TMPL_VAR> in a way similar to %s.
405 =item -
407 Context for the extracted strings: the gettext format provides
408 the filenames and line numbers where each string can be found.
409 The translator can read the source file and see the context,
410 in case the string by itself can mean several different things.
412 =item -
414 Place for the translator to add comments about the translations.
416 =item -
418 Gettext-compatible tools, if any, might be usable if we adopt
419 the gettext format.
421 =back
423 This script has already been in use for over a year and should
424 be reasonable stable. Nevertheless, it is still somewhat
425 experimental and there are still some issues.
427 Please refer to the explanation in tmpl_process3 for further
428 details.
430 If you want to generate GNOME-style POTFILES.in files, such
431 files (passed to -f) can be generated thus:
433 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
434 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
435 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
436 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
438 This is, however, quite pointless, because the "create" and
439 "update" actions have already been implemented in tmpl_process3.pl.
441 =head2 Strings inside JavaScript
443 In the SCRIPT elements, the script will attempt to scan for
444 _("I<string literal>") patterns, and extract the I<string literal>
445 as a translatable string.
447 Note that the C-like _(...) notation is required.
449 The JavaScript must actually define a _ function
450 so that the code remains correct JavaScript.
451 A suitable definition of such a function can be
453 function _(s) { return s } // dummy function for gettext
455 =head1 SEE ALSO
457 tmpl_process3.pl,
458 xgettext(1),
459 Locale::PO(3),
460 translator_doc.txt
462 =head1 BUGS
464 There probably are some. Bugs related to scanning of <INPUT>
465 tags seem to be especially likely to be present.
467 Its diagnostics are probably too verbose.
469 When a <TMPL_VAR> within a JavaScript-related attribute is
470 detected, the script currently displays no warnings at all.
471 It might be good to display some kind of warning.
473 Its sort order (-s option) seems to be different than the real
474 xgettext(1)'s sort option. This will result in translation
475 strings inside the generated PO file spuriously moving about
476 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
478 If a Javascript string has leading spaces, it will
479 generate strings with spurious leading spaces,
480 leading to failure to match the strings when actually generating
481 translated files.
483 =cut