Bug 17660: #adv is considered as an ad by adblock
[koha.git] / misc / translator / xgettext.pl
blob8d555478ad7c92821400de6e2909e4e4cbc1aa2e
1 #!/usr/bin/perl
3 =head1 NAME
5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
7 =cut
9 use strict;
10 use warnings;
11 use Getopt::Long;
12 use POSIX;
13 use Locale::PO;
14 use TmplTokenizer;
15 use VerboseWarnings;
17 use vars qw( $convert_from );
18 use vars qw( $files_from $directory $output $sort );
19 use vars qw( $extract_all_p );
20 use vars qw( $pedantic_p );
21 use vars qw( %text %translation );
22 use vars qw( $charset_in $charset_out );
23 use vars qw( $disable_fuzzy_p );
24 use vars qw( $verbose_p );
25 use vars qw( $po_mode_p );
27 our $OUTPUT;
29 ###############################################################################
31 sub string_negligible_p {
32 my($t) = @_; # a string
33 # Don't emit pure whitespace, pure numbers, pure punctuation,
34 # single letters, or TMPL_VAR's.
35 # Punctuation should arguably be translated. But without context
36 # they are untranslatable. Note that $t is a string, not a token object.
37 return !$extract_all_p && (
38 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
39 || $t =~ /^\d+$/ # purely digits
40 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
41 || $t =~ /^[A-Za-z]$/ # single letters
42 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
43 || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities
47 sub token_negligible_p {
48 my ($x) = @_;
49 my $t = $x->type;
50 return !$extract_all_p && (
51 $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
52 : $t == C4::TmplTokenType::DIRECTIVE() ? 1
53 : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
54 && join(
55 '',
56 map {
57 my $t = $_->type;
58 $t == C4::TmplTokenType::DIRECTIVE() ? '1'
59 : $t == C4::TmplTokenType::TAG() ? ''
60 : token_negligible_p($_) ? ''
61 : '1'
62 } @{ $x->children }
63 ) eq ''
67 ###############################################################################
69 sub remember {
70 my($token, $string) = @_;
71 # If we determine that the string is negligible, don't bother to remember
72 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
73 my $key = TmplTokenizer::string_canon( $string );
74 $text{$key} = [] unless defined $text{$key};
75 push @{$text{$key}}, $token;
79 ###############################################################################
81 sub string_list {
82 my @t = keys %text;
83 # The real gettext tools seems to sort case sensitively; I don't know why
84 @t = sort { $a cmp $b } @t if $sort eq 's';
85 @t = sort {
86 my @aa = sort { $a->pathname cmp $b->pathname
87 || $a->line_number <=> $b->line_number } @{$text{$a}};
88 my @bb = sort { $a->pathname cmp $b->pathname
89 || $a->line_number <=> $b->line_number } @{$text{$b}};
90 $aa[0]->pathname cmp $bb[0]->pathname
91 || $aa[0]->line_number <=> $bb[0]->line_number;
92 } @t if $sort eq 'F';
93 return @t;
96 ###############################################################################
98 sub text_extract {
99 my($h) = @_;
100 for (;;) {
101 my $s = TmplTokenizer::next_token $h;
102 last unless defined $s;
103 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
104 if ($kind eq C4::TmplTokenType::TEXT) {
105 if ($t =~ /\S/s && $t !~ /<!/){
106 remember( $s, $t );
108 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
109 if ($s->form =~ /\S/s && $s->form !~ /<!/){
110 remember( $s, $s->form );
112 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
113 # value [tag=input], meta
114 my $tag;
115 $tag = lc($1) if $t =~ /^<(\S+)/s;
116 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
117 if ($attr->{$a}) {
118 next if $a eq 'label' && $tag ne 'optgroup';
119 next if $a eq 'content' && $tag ne 'meta';
120 next if $a eq 'value' && ($tag ne 'input'
121 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
122 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
123 $val = TmplTokenizer::trim $val;
124 remember( $s, $val ) if $val =~ /\S/s;
127 } elsif ($s->has_js_data) {
128 for my $t (@{$s->js_data}) {
129 remember( $s, $t->[3] ) if $t->[0]; # FIXME
135 ###############################################################################
137 sub generate_strings_list {
138 # Emit all extracted strings.
139 for my $t (string_list) {
140 printf $OUTPUT "%s\n", $t;
144 ###############################################################################
146 sub generate_po_file {
147 # We don't emit the Plural-Forms header; it's meaningless for us
148 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
149 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
150 # Time stamps aren't exactly right semantically. I don't know how to fix it.
151 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
152 my $time_pot = $time;
153 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
154 print $OUTPUT <<EOF;
155 # SOME DESCRIPTIVE TITLE.
156 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
157 # This file is distributed under the same license as the PACKAGE package.
158 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
161 print $OUTPUT <<EOF unless $disable_fuzzy_p;
162 #, fuzzy
164 print $OUTPUT <<EOF;
165 msgid ""
166 msgstr ""
167 "Project-Id-Version: PACKAGE VERSION\\n"
168 "POT-Creation-Date: $time_pot\\n"
169 "PO-Revision-Date: $time_po\\n"
170 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
171 "Language-Team: LANGUAGE <LL\@li.org>\\n"
172 "MIME-Version: 1.0\\n"
173 "Content-Type: text/plain; charset=$pot_charset\\n"
174 "Content-Transfer-Encoding: 8bit\\n"
177 my $directory_re = quotemeta("$directory/");
178 for my $t (string_list) {
179 if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
180 my($token, $n) = ($text{$t}->[0], 0);
181 printf $OUTPUT "#. For the first occurrence,\n"
182 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
183 for my $param ($token->parameters_and_fields) {
184 $n += 1;
185 my $type = $param->type;
186 my $subtype = ($type == C4::TmplTokenType::TAG
187 && $param->string =~ /^<input\b/is?
188 $param->attributes->{'type'}->[1]: undef);
189 my $fmt = TmplTokenizer::_formalize( $param );
190 $fmt =~ s/^%/%$n\$/;
191 if ($type == C4::TmplTokenType::DIRECTIVE) {
192 # $type = "Template::Toolkit Directive";
193 $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
194 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
195 $2: undef;
196 printf $OUTPUT "#. %s: %s\n", $fmt,
197 "$type" . (defined $name? " name=$name": '');
198 } else {
199 my $name = $param->attributes->{'name'};
200 my $value;
201 $value = $param->attributes->{'value'}
202 unless $subtype =~ /^(?:text)$/;
203 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
204 . (defined $name? " name=$name->[1]": '')
205 . (defined $value? " value=$value->[1]": '');
208 } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
209 my($token) = ($text{$t}->[0]);
210 printf $OUTPUT "#. For the first occurrence,\n"
211 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
212 if ($token->string =~ /^<meta\b/is) {
213 my $type = $token->attributes->{'http-equiv'}->[1];
214 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
215 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
216 my $tag = uc($1);
217 my $type = (lc($tag) eq 'input'?
218 $token->attributes->{'type'}: undef);
219 my $name = $token->attributes->{'name'};
220 printf $OUTPUT "#. %s\n", $tag
221 . (defined $type? " type=$type->[1]": '')
222 . (defined $name? " name=$name->[1]": '');
224 } elsif ($text{$t}->[0]->has_js_data) {
225 printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
226 printf $OUTPUT "#. SCRIPT\n";
228 my $cformat_p;
229 for my $token (@{$text{$t}}) {
230 my $pathname = $token->pathname;
231 $pathname =~ s/^$directory_re//os;
232 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
233 printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
234 if defined $pathname && defined $token->line_number;
235 $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
237 printf $OUTPUT "#, c-format\n" if $cformat_p;
238 printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po
239 TmplTokenizer::string_canon
240 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
241 printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
242 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
246 ###############################################################################
248 sub convert_translation_file {
249 open(my $INPUT, '<', $convert_from) || die "$convert_from: $!\n";
250 VerboseWarnings::set_input_file_name $convert_from;
251 while (<$INPUT>) {
252 chomp;
253 my($msgid, $msgstr) = split(/\t/);
254 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
255 unless defined $msgstr;
257 # Fixup some of the bad strings
258 $msgid =~ s/^SELECTED>//;
260 # Create dummy token
261 my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
262 remember( $token, $msgid );
263 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
264 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
266 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
267 my $candidate = TmplTokenizer::charset_canon $2;
268 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
269 if defined $charset_in && $charset_in ne $candidate;
270 $charset_in = $candidate;
272 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
273 my $candidate = TmplTokenizer::charset_canon $2;
274 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
275 if defined $charset_out && $charset_out ne $candidate;
276 $charset_out = $candidate;
279 # The following assumption is correct; that's what HTML::Template assumes
280 if (!defined $charset_in) {
281 $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
282 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
286 ###############################################################################
288 sub usage {
289 my($exitcode) = @_;
290 my $h = $exitcode? *STDERR: *STDOUT;
291 print $h <<EOF;
292 Usage: $0 [OPTIONS]
293 Extract translatable strings from given HTML::Template input files.
295 Input file location:
296 -f, --files-from=FILE Get list of input files from FILE
297 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
299 Output file location:
300 -o, --output=FILE Write output to specified file
302 HTML::Template options:
303 -a, --extract-all Extract all strings
304 --pedantic-warnings Issue warnings even for detected problems
305 which are likely to be harmless
307 Output details:
308 -s, --sort-output generate sorted output
309 -F, --sort-by-file sort output by file location
310 -v, --verbose explain what is being done
312 Informative output:
313 --help Display this help and exit
315 Try `perldoc $0' for perhaps more information.
317 exit($exitcode);
320 ###############################################################################
322 sub usage_error {
323 print STDERR "$_[0]\n" if @_;
324 print STDERR "Try `$0 --help' for more information.\n";
325 exit(-1);
328 ###############################################################################
330 Getopt::Long::config qw( bundling no_auto_abbrev );
331 GetOptions(
332 'a|extract-all' => \$extract_all_p,
333 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
334 'convert-from=s' => \$convert_from,
335 'D|directory=s' => \$directory,
336 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
337 'f|files-from=s' => \$files_from,
338 'I|input-charset=s' => \$charset_in, # INTERNAL
339 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
340 'O|output-charset=s' => \$charset_out, # INTERNAL
341 'output|o=s' => \$output,
342 'po-mode' => \$po_mode_p, # INTERNAL
343 's|sort-output' => sub { $sort = 's' },
344 'F|sort-by-file' => sub { $sort = 'F' },
345 'v|verbose' => \$verbose_p,
346 'help' => sub { usage(0) },
347 ) || usage_error;
349 VerboseWarnings::set_application_name $0;
350 VerboseWarnings::set_pedantic_mode $pedantic_p;
352 usage_error('Missing mandatory option -f')
353 unless defined $files_from || defined $convert_from;
354 $directory = '.' unless defined $directory;
356 usage_error('You cannot specify both --convert-from and --files-from')
357 if defined $convert_from && defined $files_from;
359 if (defined $output && $output ne '-') {
360 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
361 open($OUTPUT, '>', $output) || die "$output: $!\n";
362 } else {
363 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
364 open($OUTPUT, ">&STDOUT");
367 if (defined $files_from) {
368 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
369 open(my $INPUT, '<', $files_from) || die "$files_from: $!\n";
370 while (<$INPUT>) {
371 chomp;
372 my $input = /^\//? $_: "$directory/$_";
373 my $h = TmplTokenizer->new( $input );
374 $h->set_allow_cformat( 1 );
375 VerboseWarnings::set_input_file_name $input;
376 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
377 text_extract( $h );
379 close $INPUT;
380 } else {
381 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
382 convert_translation_file;
384 generate_po_file;
386 warn "This input will not work with Mozilla standards-compliant mode\n", undef
387 if TmplTokenizer::syntaxerror_p;
390 exit(-1) if TmplTokenizer::fatal_p;
392 ###############################################################################
394 =head1 DESCRIPTION
396 This is an experimental script based on the modularized
397 text-extract2.pl script. It has behaviour similar to
398 xgettext(1), and generates gettext-compatible output files.
400 A gettext-like format provides the following advantages:
402 =over
404 =item -
406 Translation to non-English-like languages with different word
407 order: gettext's c-format strings can theoretically be
408 emulated if we are able to do some analysis on the .tt input
409 and treat <TMPL_VAR> in a way similar to %s.
411 =item -
413 Context for the extracted strings: the gettext format provides
414 the filenames and line numbers where each string can be found.
415 The translator can read the source file and see the context,
416 in case the string by itself can mean several different things.
418 =item -
420 Place for the translator to add comments about the translations.
422 =item -
424 Gettext-compatible tools, if any, might be usable if we adopt
425 the gettext format.
427 =back
429 This script has already been in use for over a year and should
430 be reasonable stable. Nevertheless, it is still somewhat
431 experimental and there are still some issues.
433 Please refer to the explanation in tmpl_process3 for further
434 details.
436 If you want to generate GNOME-style POTFILES.in files, such
437 files (passed to -f) can be generated thus:
439 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
440 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
441 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
442 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
444 This is, however, quite pointless, because the "create" and
445 "update" actions have already been implemented in tmpl_process3.pl.
447 =head2 Strings inside JavaScript
449 In the SCRIPT elements, the script will attempt to scan for
450 _("I<string literal>") patterns, and extract the I<string literal>
451 as a translatable string.
453 Note that the C-like _(...) notation is required.
455 The JavaScript must actually define a _ function
456 so that the code remains correct JavaScript.
457 A suitable definition of such a function can be
459 function _(s) { return s } // dummy function for gettext
461 =head1 SEE ALSO
463 tmpl_process3.pl,
464 xgettext(1),
465 Locale::PO(3),
466 translator_doc.txt
468 =head1 BUGS
470 There probably are some. Bugs related to scanning of <INPUT>
471 tags seem to be especially likely to be present.
473 Its diagnostics are probably too verbose.
475 When a <TMPL_VAR> within a JavaScript-related attribute is
476 detected, the script currently displays no warnings at all.
477 It might be good to display some kind of warning.
479 Its sort order (-s option) seems to be different than the real
480 xgettext(1)'s sort option. This will result in translation
481 strings inside the generated PO file spuriously moving about
482 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
484 If a Javascript string has leading spaces, it will
485 generate strings with spurious leading spaces,
486 leading to failure to match the strings when actually generating
487 translated files.
489 =cut