Merge remote branch 'kc/new/enh/bug_5283' into kcmaster
[koha.git] / misc / translator / xgettext.pl
blob031f9ba4e6728078f4c1d5dc008860aee781add1
1 #!/usr/bin/perl
3 =head1 NAME
5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
7 =cut
9 use strict;
10 use warnings;
11 use Getopt::Long;
12 use POSIX;
13 use Locale::PO;
14 use TmplTokenizer;
15 use VerboseWarnings;
17 use vars qw( $convert_from );
18 use vars qw( $files_from $directory $output $sort );
19 use vars qw( $extract_all_p );
20 use vars qw( $pedantic_p );
21 use vars qw( %text %translation );
22 use vars qw( $charset_in $charset_out );
23 use vars qw( $disable_fuzzy_p );
24 use vars qw( $verbose_p );
25 use vars qw( $po_mode_p );
27 ###############################################################################
29 sub string_negligible_p ($) {
30 my($t) = @_; # a string
31 # Don't emit pure whitespace, pure numbers, pure punctuation,
32 # single letters, or TMPL_VAR's.
33 # Punctuation should arguably be translated. But without context
34 # they are untranslatable. Note that $t is a string, not a token object.
35 return !$extract_all_p && (
36 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
37 || $t =~ /^\d+$/ # purely digits
38 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
39 || $t =~ /^[A-Za-z]$/ # single letters
43 sub token_negligible_p( $ ) {
44 my($x) = @_;
45 my $t = $x->type;
46 return !$extract_all_p && (
47 $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
48 $t == TmplTokenType::DIRECTIVE? 1:
49 $t == TmplTokenType::TEXT_PARAMETRIZED
50 && join( '', map { my $t = $_->type;
51 $t == TmplTokenType::DIRECTIVE?
52 '1': $t == TmplTokenType::TAG?
53 '': token_negligible_p( $_ )?
54 '': '1' } @{$x->children} ) eq '' );
57 ###############################################################################
59 sub remember ($$) {
60 my($token, $string) = @_;
61 # If we determine that the string is negligible, don't bother to remember
62 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
63 my $key = TmplTokenizer::string_canon( $string );
64 $text{$key} = [] unless defined $text{$key};
65 push @{$text{$key}}, $token;
69 ###############################################################################
71 sub string_list () {
72 my @t = keys %text;
73 # The real gettext tools seems to sort case sensitively; I don't know why
74 @t = sort { $a cmp $b } @t if $sort eq 's';
75 @t = sort {
76 my @aa = sort { $a->pathname cmp $b->pathname
77 || $a->line_number <=> $b->line_number } @{$text{$a}};
78 my @bb = sort { $a->pathname cmp $b->pathname
79 || $a->line_number <=> $b->line_number } @{$text{$b}};
80 $aa[0]->pathname cmp $bb[0]->pathname
81 || $aa[0]->line_number <=> $bb[0]->line_number;
82 } @t if $sort eq 'F';
83 return @t;
86 ###############################################################################
88 sub text_extract (*) {
89 my($h) = @_;
90 for (;;) {
91 my $s = TmplTokenizer::next_token $h;
92 last unless defined $s;
93 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
94 if ($kind eq TmplTokenType::TEXT) {
95 remember( $s, $t ) if $t =~ /\S/s;
96 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
97 remember( $s, $s->form ) if $s->form =~ /\S/s;
98 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
99 # value [tag=input], meta
100 my $tag = lc($1) if $t =~ /^<(\S+)/s;
101 for my $a ('alt', 'content', 'title', 'value','label') {
102 if ($attr->{$a}) {
103 next if $a eq 'label' && $tag ne 'optgroup';
104 next if $a eq 'content' && $tag ne 'meta';
105 next if $a eq 'value' && ($tag ne 'input'
106 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
107 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
108 $val = TmplTokenizer::trim $val;
109 remember( $s, $val ) if $val =~ /\S/s;
112 } elsif ($s->has_js_data) {
113 for my $t (@{$s->js_data}) {
114 remember( $s, $t->[3] ) if $t->[0]; # FIXME
120 ###############################################################################
122 sub generate_strings_list () {
123 # Emit all extracted strings.
124 for my $t (string_list) {
125 printf OUTPUT "%s\n", $t;
129 ###############################################################################
131 sub generate_po_file () {
132 # We don't emit the Plural-Forms header; it's meaningless for us
133 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
134 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
135 # Time stamps aren't exactly right semantically. I don't know how to fix it.
136 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
137 my $time_pot = $time;
138 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
139 print OUTPUT <<EOF;
140 # SOME DESCRIPTIVE TITLE.
141 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
142 # This file is distributed under the same license as the PACKAGE package.
143 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
146 print OUTPUT <<EOF unless $disable_fuzzy_p;
147 #, fuzzy
149 print OUTPUT <<EOF;
150 msgid ""
151 msgstr ""
152 "Project-Id-Version: PACKAGE VERSION\\n"
153 "POT-Creation-Date: $time_pot\\n"
154 "PO-Revision-Date: $time_po\\n"
155 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
156 "Language-Team: LANGUAGE <LL\@li.org>\\n"
157 "MIME-Version: 1.0\\n"
158 "Content-Type: text/plain; charset=$pot_charset\\n"
159 "Content-Transfer-Encoding: 8bit\\n"
162 my $directory_re = quotemeta("$directory/");
163 for my $t (string_list) {
164 if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
165 my($token, $n) = ($text{$t}->[0], 0);
166 printf OUTPUT "#. For the first occurrence,\n"
167 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
168 for my $param ($token->parameters_and_fields) {
169 $n += 1;
170 my $type = $param->type;
171 my $subtype = ($type == TmplTokenType::TAG
172 && $param->string =~ /^<input\b/is?
173 $param->attributes->{'type'}->[1]: undef);
174 my $fmt = TmplTokenizer::_formalize( $param );
175 $fmt =~ s/^%/%$n\$/;
176 if ($type == TmplTokenType::DIRECTIVE) {
177 $type = $param->string =~ /(TMPL_[A-Z]+)+/is? $1: 'ERROR';
178 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
179 $2: undef;
180 printf OUTPUT "#. %s: %s\n", $fmt,
181 "$type" . (defined $name? " name=$name": '');
182 } else {
183 my $name = $param->attributes->{'name'};
184 my $value = $param->attributes->{'value'}
185 unless $subtype =~ /^(?:text)$/;
186 printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
187 . (defined $name? " name=$name->[1]": '')
188 . (defined $value? " value=$value->[1]": '');
191 } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
192 my($token) = ($text{$t}->[0]);
193 printf OUTPUT "#. For the first occurrence,\n"
194 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
195 if ($token->string =~ /^<meta\b/is) {
196 my $type = $token->attributes->{'http-equiv'}->[1];
197 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
198 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
199 my $tag = uc($1);
200 my $type = (lc($tag) eq 'input'?
201 $token->attributes->{'type'}: undef);
202 my $name = $token->attributes->{'name'};
203 printf OUTPUT "#. %s\n", $tag
204 . (defined $type? " type=$type->[1]": '')
205 . (defined $name? " name=$name->[1]": '');
207 } elsif ($text{$t}->[0]->has_js_data) {
208 printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
209 printf OUTPUT "#. SCRIPT\n";
211 my $cformat_p;
212 for my $token (@{$text{$t}}) {
213 my $pathname = $token->pathname;
214 $pathname =~ s/^$directory_re//os;
215 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
216 if defined $pathname && defined $token->line_number;
217 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
219 printf OUTPUT "#, c-format\n" if $cformat_p;
220 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
221 TmplTokenizer::string_canon
222 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
223 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
224 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
228 ###############################################################################
230 sub convert_translation_file () {
231 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
232 VerboseWarnings::set_input_file_name $convert_from;
233 while (<INPUT>) {
234 chomp;
235 my($msgid, $msgstr) = split(/\t/);
236 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
237 unless defined $msgstr;
239 # Fixup some of the bad strings
240 $msgid =~ s/^SELECTED>//;
242 # Create dummy token
243 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
244 remember( $token, $msgid );
245 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
246 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
248 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
249 my $candidate = TmplTokenizer::charset_canon $2;
250 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
251 if defined $charset_in && $charset_in ne $candidate;
252 $charset_in = $candidate;
254 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
255 my $candidate = TmplTokenizer::charset_canon $2;
256 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
257 if defined $charset_out && $charset_out ne $candidate;
258 $charset_out = $candidate;
261 # The following assumption is correct; that's what HTML::Template assumes
262 if (!defined $charset_in) {
263 $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
264 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
268 ###############################################################################
270 sub usage ($) {
271 my($exitcode) = @_;
272 my $h = $exitcode? *STDERR: *STDOUT;
273 print $h <<EOF;
274 Usage: $0 [OPTIONS]
275 Extract translatable strings from given HTML::Template input files.
277 Input file location:
278 -f, --files-from=FILE Get list of input files from FILE
279 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
281 Output file location:
282 -o, --output=FILE Write output to specified file
284 HTML::Template options:
285 -a, --extract-all Extract all strings
286 --pedantic-warnings Issue warnings even for detected problems
287 which are likely to be harmless
289 Output details:
290 -s, --sort-output generate sorted output
291 -F, --sort-by-file sort output by file location
292 -v, --verbose explain what is being done
294 Informative output:
295 --help Display this help and exit
297 Try `perldoc $0' for perhaps more information.
299 exit($exitcode);
302 ###############################################################################
304 sub usage_error (;$) {
305 print STDERR "$_[0]\n" if @_;
306 print STDERR "Try `$0 --help' for more information.\n";
307 exit(-1);
310 ###############################################################################
312 Getopt::Long::config qw( bundling no_auto_abbrev );
313 GetOptions(
314 'a|extract-all' => \$extract_all_p,
315 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
316 'convert-from=s' => \$convert_from,
317 'D|directory=s' => \$directory,
318 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
319 'f|files-from=s' => \$files_from,
320 'I|input-charset=s' => \$charset_in, # INTERNAL
321 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
322 'O|output-charset=s' => \$charset_out, # INTERNAL
323 'output|o=s' => \$output,
324 'po-mode' => \$po_mode_p, # INTERNAL
325 's|sort-output' => sub { $sort = 's' },
326 'F|sort-by-file' => sub { $sort = 'F' },
327 'v|verbose' => \$verbose_p,
328 'help' => sub { usage(0) },
329 ) || usage_error;
331 VerboseWarnings::set_application_name $0;
332 VerboseWarnings::set_pedantic_mode $pedantic_p;
334 usage_error('Missing mandatory option -f')
335 unless defined $files_from || defined $convert_from;
336 $directory = '.' unless defined $directory;
338 usage_error('You cannot specify both --convert-from and --files-from')
339 if defined $convert_from && defined $files_from;
341 if (defined $output && $output ne '-') {
342 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
343 open(OUTPUT, ">$output") || die "$output: $!\n";
344 } else {
345 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
346 open(OUTPUT, ">&STDOUT");
349 if (defined $files_from) {
350 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
351 open(INPUT, "<$files_from") || die "$files_from: $!\n";
352 while (<INPUT>) {
353 chomp;
354 my $input = /^\//? $_: "$directory/$_";
355 my $h = TmplTokenizer->new( $input );
356 $h->set_allow_cformat( 1 );
357 VerboseWarnings::set_input_file_name $input;
358 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
359 text_extract( $h );
361 close INPUT;
362 } else {
363 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
364 convert_translation_file;
366 generate_po_file;
368 warn "This input will not work with Mozilla standards-compliant mode\n", undef
369 if TmplTokenizer::syntaxerror_p;
372 exit(-1) if TmplTokenizer::fatal_p;
374 ###############################################################################
376 =head1 DESCRIPTION
378 This is an experimental script based on the modularized
379 text-extract2.pl script. It has behaviour similar to
380 xgettext(1), and generates gettext-compatible output files.
382 A gettext-like format provides the following advantages:
384 =over
386 =item -
388 Translation to non-English-like languages with different word
389 order: gettext's c-format strings can theoretically be
390 emulated if we are able to do some analysis on the .tmpl input
391 and treat <TMPL_VAR> in a way similar to %s.
393 =item -
395 Context for the extracted strings: the gettext format provides
396 the filenames and line numbers where each string can be found.
397 The translator can read the source file and see the context,
398 in case the string by itself can mean several different things.
400 =item -
402 Place for the translator to add comments about the translations.
404 =item -
406 Gettext-compatible tools, if any, might be usable if we adopt
407 the gettext format.
409 =back
411 This script has already been in use for over a year and should
412 be reasonable stable. Nevertheless, it is still somewhat
413 experimental and there are still some issues.
415 Please refer to the explanation in tmpl_process3 for further
416 details.
418 If you want to generate GNOME-style POTFILES.in files, such
419 files (passed to -f) can be generated thus:
421 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
422 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
423 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
424 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
426 This is, however, quite pointless, because the "create" and
427 "update" actions have already been implemented in tmpl_process3.pl.
429 =head2 Strings inside JavaScript
431 In the SCRIPT elements, the script will attempt to scan for
432 _("I<string literal>") patterns, and extract the I<string literal>
433 as a translatable string.
435 Note that the C-like _(...) notation is required.
437 The JavaScript must actually define a _ function
438 so that the code remains correct JavaScript.
439 A suitable definition of such a function can be
441 function _(s) { return s } // dummy function for gettext
443 =head1 SEE ALSO
445 tmpl_process3.pl,
446 xgettext(1),
447 Locale::PO(3),
448 translator_doc.txt
450 =head1 BUGS
452 There probably are some. Bugs related to scanning of <INPUT>
453 tags seem to be especially likely to be present.
455 Its diagnostics are probably too verbose.
457 When a <TMPL_VAR> within a JavaScript-related attribute is
458 detected, the script currently displays no warnings at all.
459 It might be good to display some kind of warning.
461 Its sort order (-s option) seems to be different than the real
462 xgettext(1)'s sort option. This will result in translation
463 strings inside the generated PO file spuriously moving about
464 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
466 If a Javascript string has leading spaces, it will
467 generate strings with spurious leading spaces,
468 leading to failure to match the strings when actually generating
469 translated files.
471 =cut