Bug 12595: (regression tests)
[koha.git] / misc / translator / xgettext.pl
blob976ab4769c0720e150791c479f3c970d781e74ee
1 #!/usr/bin/perl
3 =head1 NAME
5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
7 =cut
9 use strict;
10 use warnings;
11 use Getopt::Long;
12 use POSIX;
13 use Locale::PO;
14 use TmplTokenizer;
15 use VerboseWarnings;
17 use vars qw( $convert_from );
18 use vars qw( $files_from $directory $output $sort );
19 use vars qw( $extract_all_p );
20 use vars qw( $pedantic_p );
21 use vars qw( %text %translation );
22 use vars qw( $charset_in $charset_out );
23 use vars qw( $disable_fuzzy_p );
24 use vars qw( $verbose_p );
25 use vars qw( $po_mode_p );
27 our $OUTPUT;
29 ###############################################################################
31 sub string_negligible_p {
32 my($t) = @_; # a string
33 # Don't emit pure whitespace, pure numbers, pure punctuation,
34 # single letters, or TMPL_VAR's.
35 # Punctuation should arguably be translated. But without context
36 # they are untranslatable. Note that $t is a string, not a token object.
37 return !$extract_all_p && (
38 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
39 || $t =~ /^\d+$/ # purely digits
40 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
41 || $t =~ /^[A-Za-z]$/ # single letters
42 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
46 sub token_negligible_p {
47 my($x) = @_;
48 my $t = $x->type;
49 return !$extract_all_p && (
50 $t == C4::TmplTokenType::TEXT? string_negligible_p( $x->string ):
51 $t == C4::TmplTokenType::DIRECTIVE? 1:
52 $t == C4::TmplTokenType::TEXT_PARAMETRIZED
53 && join( '', map { my $t = $_->type;
54 $t == C4::TmplTokenType::DIRECTIVE?
55 '1': $t == C4::TmplTokenType::TAG?
56 '': token_negligible_p( $_ )?
57 '': '1' } @{$x->children} ) eq '' );
60 ###############################################################################
62 sub remember {
63 my($token, $string) = @_;
64 # If we determine that the string is negligible, don't bother to remember
65 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
66 my $key = TmplTokenizer::string_canon( $string );
67 $text{$key} = [] unless defined $text{$key};
68 push @{$text{$key}}, $token;
72 ###############################################################################
74 sub string_list {
75 my @t = keys %text;
76 # The real gettext tools seems to sort case sensitively; I don't know why
77 @t = sort { $a cmp $b } @t if $sort eq 's';
78 @t = sort {
79 my @aa = sort { $a->pathname cmp $b->pathname
80 || $a->line_number <=> $b->line_number } @{$text{$a}};
81 my @bb = sort { $a->pathname cmp $b->pathname
82 || $a->line_number <=> $b->line_number } @{$text{$b}};
83 $aa[0]->pathname cmp $bb[0]->pathname
84 || $aa[0]->line_number <=> $bb[0]->line_number;
85 } @t if $sort eq 'F';
86 return @t;
89 ###############################################################################
91 sub text_extract {
92 my($h) = @_;
93 for (;;) {
94 my $s = TmplTokenizer::next_token $h;
95 last unless defined $s;
96 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
97 if ($kind eq C4::TmplTokenType::TEXT) {
98 if ($t =~ /\S/s && $t !~ /<!/){
99 remember( $s, $t );
101 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
102 if ($s->form =~ /\S/s && $s->form !~ /<!/){
103 remember( $s, $s->form );
105 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
106 # value [tag=input], meta
107 my $tag;
108 $tag = lc($1) if $t =~ /^<(\S+)/s;
109 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
110 if ($attr->{$a}) {
111 next if $a eq 'label' && $tag ne 'optgroup';
112 next if $a eq 'content' && $tag ne 'meta';
113 next if $a eq 'value' && ($tag ne 'input'
114 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
115 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
116 $val = TmplTokenizer::trim $val;
117 remember( $s, $val ) if $val =~ /\S/s;
120 } elsif ($s->has_js_data) {
121 for my $t (@{$s->js_data}) {
122 remember( $s, $t->[3] ) if $t->[0]; # FIXME
128 ###############################################################################
130 sub generate_strings_list {
131 # Emit all extracted strings.
132 for my $t (string_list) {
133 printf $OUTPUT "%s\n", $t;
137 ###############################################################################
139 sub generate_po_file {
140 # We don't emit the Plural-Forms header; it's meaningless for us
141 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
142 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
143 # Time stamps aren't exactly right semantically. I don't know how to fix it.
144 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
145 my $time_pot = $time;
146 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
147 print $OUTPUT <<EOF;
148 # SOME DESCRIPTIVE TITLE.
149 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
150 # This file is distributed under the same license as the PACKAGE package.
151 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
154 print $OUTPUT <<EOF unless $disable_fuzzy_p;
155 #, fuzzy
157 print $OUTPUT <<EOF;
158 msgid ""
159 msgstr ""
160 "Project-Id-Version: PACKAGE VERSION\\n"
161 "POT-Creation-Date: $time_pot\\n"
162 "PO-Revision-Date: $time_po\\n"
163 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
164 "Language-Team: LANGUAGE <LL\@li.org>\\n"
165 "MIME-Version: 1.0\\n"
166 "Content-Type: text/plain; charset=$pot_charset\\n"
167 "Content-Transfer-Encoding: 8bit\\n"
170 my $directory_re = quotemeta("$directory/");
171 for my $t (string_list) {
172 if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
173 my($token, $n) = ($text{$t}->[0], 0);
174 printf $OUTPUT "#. For the first occurrence,\n"
175 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
176 for my $param ($token->parameters_and_fields) {
177 $n += 1;
178 my $type = $param->type;
179 my $subtype = ($type == C4::TmplTokenType::TAG
180 && $param->string =~ /^<input\b/is?
181 $param->attributes->{'type'}->[1]: undef);
182 my $fmt = TmplTokenizer::_formalize( $param );
183 $fmt =~ s/^%/%$n\$/;
184 if ($type == C4::TmplTokenType::DIRECTIVE) {
185 # $type = "Template::Toolkit Directive";
186 $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
187 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
188 $2: undef;
189 printf $OUTPUT "#. %s: %s\n", $fmt,
190 "$type" . (defined $name? " name=$name": '');
191 } else {
192 my $name = $param->attributes->{'name'};
193 my $value;
194 $value = $param->attributes->{'value'}
195 unless $subtype =~ /^(?:text)$/;
196 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
197 . (defined $name? " name=$name->[1]": '')
198 . (defined $value? " value=$value->[1]": '');
201 } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
202 my($token) = ($text{$t}->[0]);
203 printf $OUTPUT "#. For the first occurrence,\n"
204 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
205 if ($token->string =~ /^<meta\b/is) {
206 my $type = $token->attributes->{'http-equiv'}->[1];
207 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
208 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
209 my $tag = uc($1);
210 my $type = (lc($tag) eq 'input'?
211 $token->attributes->{'type'}: undef);
212 my $name = $token->attributes->{'name'};
213 printf $OUTPUT "#. %s\n", $tag
214 . (defined $type? " type=$type->[1]": '')
215 . (defined $name? " name=$name->[1]": '');
217 } elsif ($text{$t}->[0]->has_js_data) {
218 printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
219 printf $OUTPUT "#. SCRIPT\n";
221 my $cformat_p;
222 for my $token (@{$text{$t}}) {
223 my $pathname = $token->pathname;
224 $pathname =~ s/^$directory_re//os;
225 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
226 printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
227 if defined $pathname && defined $token->line_number;
228 $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
230 printf $OUTPUT "#, c-format\n" if $cformat_p;
231 printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po
232 TmplTokenizer::string_canon
233 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
234 printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
235 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
239 ###############################################################################
241 sub convert_translation_file {
242 open(my $INPUT, '<', $convert_from) || die "$convert_from: $!\n";
243 VerboseWarnings::set_input_file_name $convert_from;
244 while (<$INPUT>) {
245 chomp;
246 my($msgid, $msgstr) = split(/\t/);
247 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
248 unless defined $msgstr;
250 # Fixup some of the bad strings
251 $msgid =~ s/^SELECTED>//;
253 # Create dummy token
254 my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
255 remember( $token, $msgid );
256 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
257 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
259 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
260 my $candidate = TmplTokenizer::charset_canon $2;
261 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
262 if defined $charset_in && $charset_in ne $candidate;
263 $charset_in = $candidate;
265 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
266 my $candidate = TmplTokenizer::charset_canon $2;
267 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
268 if defined $charset_out && $charset_out ne $candidate;
269 $charset_out = $candidate;
272 # The following assumption is correct; that's what HTML::Template assumes
273 if (!defined $charset_in) {
274 $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
275 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
279 ###############################################################################
281 sub usage {
282 my($exitcode) = @_;
283 my $h = $exitcode? *STDERR: *STDOUT;
284 print $h <<EOF;
285 Usage: $0 [OPTIONS]
286 Extract translatable strings from given HTML::Template input files.
288 Input file location:
289 -f, --files-from=FILE Get list of input files from FILE
290 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
292 Output file location:
293 -o, --output=FILE Write output to specified file
295 HTML::Template options:
296 -a, --extract-all Extract all strings
297 --pedantic-warnings Issue warnings even for detected problems
298 which are likely to be harmless
300 Output details:
301 -s, --sort-output generate sorted output
302 -F, --sort-by-file sort output by file location
303 -v, --verbose explain what is being done
305 Informative output:
306 --help Display this help and exit
308 Try `perldoc $0' for perhaps more information.
310 exit($exitcode);
313 ###############################################################################
315 sub usage_error {
316 print STDERR "$_[0]\n" if @_;
317 print STDERR "Try `$0 --help' for more information.\n";
318 exit(-1);
321 ###############################################################################
323 Getopt::Long::config qw( bundling no_auto_abbrev );
324 GetOptions(
325 'a|extract-all' => \$extract_all_p,
326 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
327 'convert-from=s' => \$convert_from,
328 'D|directory=s' => \$directory,
329 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
330 'f|files-from=s' => \$files_from,
331 'I|input-charset=s' => \$charset_in, # INTERNAL
332 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
333 'O|output-charset=s' => \$charset_out, # INTERNAL
334 'output|o=s' => \$output,
335 'po-mode' => \$po_mode_p, # INTERNAL
336 's|sort-output' => sub { $sort = 's' },
337 'F|sort-by-file' => sub { $sort = 'F' },
338 'v|verbose' => \$verbose_p,
339 'help' => sub { usage(0) },
340 ) || usage_error;
342 VerboseWarnings::set_application_name $0;
343 VerboseWarnings::set_pedantic_mode $pedantic_p;
345 usage_error('Missing mandatory option -f')
346 unless defined $files_from || defined $convert_from;
347 $directory = '.' unless defined $directory;
349 usage_error('You cannot specify both --convert-from and --files-from')
350 if defined $convert_from && defined $files_from;
352 if (defined $output && $output ne '-') {
353 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
354 open($OUTPUT, '>', $output) || die "$output: $!\n";
355 } else {
356 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
357 open($OUTPUT, ">&STDOUT");
360 if (defined $files_from) {
361 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
362 open(my $INPUT, '<', $files_from) || die "$files_from: $!\n";
363 while (<$INPUT>) {
364 chomp;
365 my $input = /^\//? $_: "$directory/$_";
366 my $h = TmplTokenizer->new( $input );
367 $h->set_allow_cformat( 1 );
368 VerboseWarnings::set_input_file_name $input;
369 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
370 text_extract( $h );
372 close $INPUT;
373 } else {
374 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
375 convert_translation_file;
377 generate_po_file;
379 warn "This input will not work with Mozilla standards-compliant mode\n", undef
380 if TmplTokenizer::syntaxerror_p;
383 exit(-1) if TmplTokenizer::fatal_p;
385 ###############################################################################
387 =head1 DESCRIPTION
389 This is an experimental script based on the modularized
390 text-extract2.pl script. It has behaviour similar to
391 xgettext(1), and generates gettext-compatible output files.
393 A gettext-like format provides the following advantages:
395 =over
397 =item -
399 Translation to non-English-like languages with different word
400 order: gettext's c-format strings can theoretically be
401 emulated if we are able to do some analysis on the .tt input
402 and treat <TMPL_VAR> in a way similar to %s.
404 =item -
406 Context for the extracted strings: the gettext format provides
407 the filenames and line numbers where each string can be found.
408 The translator can read the source file and see the context,
409 in case the string by itself can mean several different things.
411 =item -
413 Place for the translator to add comments about the translations.
415 =item -
417 Gettext-compatible tools, if any, might be usable if we adopt
418 the gettext format.
420 =back
422 This script has already been in use for over a year and should
423 be reasonable stable. Nevertheless, it is still somewhat
424 experimental and there are still some issues.
426 Please refer to the explanation in tmpl_process3 for further
427 details.
429 If you want to generate GNOME-style POTFILES.in files, such
430 files (passed to -f) can be generated thus:
432 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
433 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
434 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
435 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
437 This is, however, quite pointless, because the "create" and
438 "update" actions have already been implemented in tmpl_process3.pl.
440 =head2 Strings inside JavaScript
442 In the SCRIPT elements, the script will attempt to scan for
443 _("I<string literal>") patterns, and extract the I<string literal>
444 as a translatable string.
446 Note that the C-like _(...) notation is required.
448 The JavaScript must actually define a _ function
449 so that the code remains correct JavaScript.
450 A suitable definition of such a function can be
452 function _(s) { return s } // dummy function for gettext
454 =head1 SEE ALSO
456 tmpl_process3.pl,
457 xgettext(1),
458 Locale::PO(3),
459 translator_doc.txt
461 =head1 BUGS
463 There probably are some. Bugs related to scanning of <INPUT>
464 tags seem to be especially likely to be present.
466 Its diagnostics are probably too verbose.
468 When a <TMPL_VAR> within a JavaScript-related attribute is
469 detected, the script currently displays no warnings at all.
470 It might be good to display some kind of warning.
472 Its sort order (-s option) seems to be different than the real
473 xgettext(1)'s sort option. This will result in translation
474 strings inside the generated PO file spuriously moving about
475 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
477 If a Javascript string has leading spaces, it will
478 generate strings with spurious leading spaces,
479 leading to failure to match the strings when actually generating
480 translated files.
482 =cut