corrected init script name for apache2
[koha.git] / misc / translator / xgettext.pl
blobb793867f29235f3e48a8bfb68f637a0ea75b35c9
1 #!/usr/bin/perl
3 =head1 NAME
5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
7 =cut
9 use strict;
10 use Getopt::Long;
11 use POSIX;
12 use Locale::PO;
13 use TmplTokenizer;
14 use VerboseWarnings;
16 use vars qw( $convert_from );
17 use vars qw( $files_from $directory $output $sort );
18 use vars qw( $extract_all_p );
19 use vars qw( $pedantic_p );
20 use vars qw( %text %translation );
21 use vars qw( $charset_in $charset_out );
22 use vars qw( $disable_fuzzy_p );
23 use vars qw( $verbose_p );
24 use vars qw( $po_mode_p );
26 ###############################################################################
28 sub string_negligible_p ($) {
29 my($t) = @_; # a string
30 # Don't emit pure whitespace, pure numbers, pure punctuation,
31 # single letters, or TMPL_VAR's.
32 # Punctuation should arguably be translated. But without context
33 # they are untranslatable. Note that $t is a string, not a token object.
34 return !$extract_all_p && (
35 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
36 || $t =~ /^\d+$/ # purely digits
37 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
38 || $t =~ /^[A-Za-z]$/ # single letters
42 sub token_negligible_p( $ ) {
43 my($x) = @_;
44 my $t = $x->type;
45 return !$extract_all_p && (
46 $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
47 $t == TmplTokenType::DIRECTIVE? 1:
48 $t == TmplTokenType::TEXT_PARAMETRIZED
49 && join( '', map { my $t = $_->type;
50 $t == TmplTokenType::DIRECTIVE?
51 '1': $t == TmplTokenType::TAG?
52 '': token_negligible_p( $_ )?
53 '': '1' } @{$x->children} ) eq '' );
56 ###############################################################################
58 sub remember ($$) {
59 my($token, $string) = @_;
60 # If we determine that the string is negligible, don't bother to remember
61 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
62 my $key = TmplTokenizer::string_canon( $string );
63 $text{$key} = [] unless defined $text{$key};
64 push @{$text{$key}}, $token;
68 ###############################################################################
70 sub string_list () {
71 my @t = keys %text;
72 # The real gettext tools seems to sort case sensitively; I don't know why
73 @t = sort { $a cmp $b } @t if $sort eq 's';
74 @t = sort {
75 my @aa = sort { $a->pathname cmp $b->pathname
76 || $a->line_number <=> $b->line_number } @{$text{$a}};
77 my @bb = sort { $a->pathname cmp $b->pathname
78 || $a->line_number <=> $b->line_number } @{$text{$b}};
79 $aa[0]->pathname cmp $bb[0]->pathname
80 || $aa[0]->line_number <=> $bb[0]->line_number;
81 } @t if $sort eq 'F';
82 return @t;
85 ###############################################################################
87 sub text_extract (*) {
88 my($h) = @_;
89 for (;;) {
90 my $s = TmplTokenizer::next_token $h;
91 last unless defined $s;
92 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
93 if ($kind eq TmplTokenType::TEXT) {
94 remember( $s, $t ) if $t =~ /\S/s;
95 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
96 remember( $s, $s->form ) if $s->form =~ /\S/s;
97 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
98 # value [tag=input], meta
99 my $tag = lc($1) if $t =~ /^<(\S+)/s;
100 for my $a ('alt', 'content', 'title', 'value') {
101 if ($attr->{$a}) {
102 next if $a eq 'content' && $tag ne 'meta';
103 next if $a eq 'value' && ($tag ne 'input'
104 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
105 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
106 $val = TmplTokenizer::trim $val;
107 remember( $s, $val ) if $val =~ /\S/s;
110 } elsif ($s->has_js_data) {
111 for my $t (@{$s->js_data}) {
112 remember( $s, $t->[3] ) if $t->[0]; # FIXME
118 ###############################################################################
120 sub generate_strings_list () {
121 # Emit all extracted strings.
122 for my $t (string_list) {
123 printf OUTPUT "%s\n", $t;
127 ###############################################################################
129 sub generate_po_file () {
130 # We don't emit the Plural-Forms header; it's meaningless for us
131 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
132 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
133 # Time stamps aren't exactly right semantically. I don't know how to fix it.
134 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
135 my $time_pot = $time;
136 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
137 print OUTPUT <<EOF;
138 # SOME DESCRIPTIVE TITLE.
139 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
140 # This file is distributed under the same license as the PACKAGE package.
141 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
144 print OUTPUT <<EOF unless $disable_fuzzy_p;
145 #, fuzzy
147 print OUTPUT <<EOF;
148 msgid ""
149 msgstr ""
150 "Project-Id-Version: PACKAGE VERSION\\n"
151 "POT-Creation-Date: $time_pot\\n"
152 "PO-Revision-Date: $time_po\\n"
153 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
154 "Language-Team: LANGUAGE <LL\@li.org>\\n"
155 "MIME-Version: 1.0\\n"
156 "Content-Type: text/plain; charset=$pot_charset\\n"
157 "Content-Transfer-Encoding: 8bit\\n"
160 my $directory_re = quotemeta("$directory/");
161 for my $t (string_list) {
162 if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
163 my($token, $n) = ($text{$t}->[0], 0);
164 printf OUTPUT "#. For the first occurrence,\n"
165 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
166 for my $param ($token->parameters_and_fields) {
167 $n += 1;
168 my $type = $param->type;
169 my $subtype = ($type == TmplTokenType::TAG
170 && $param->string =~ /^<input\b/is?
171 $param->attributes->{'type'}->[1]: undef);
172 my $fmt = TmplTokenizer::_formalize( $param );
173 $fmt =~ s/^%/%$n\$/;
174 if ($type == TmplTokenType::DIRECTIVE) {
175 $type = $param->string =~ /(TMPL_[A-Z]+)+/is? $1: 'ERROR';
176 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
177 $2: undef;
178 printf OUTPUT "#. %s: %s\n", $fmt,
179 "$type" . (defined $name? " name=$name": '');
180 } else {
181 my $name = $param->attributes->{'name'};
182 my $value = $param->attributes->{'value'}
183 unless $subtype =~ /^(?:text)$/;
184 printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
185 . (defined $name? " name=$name->[1]": '')
186 . (defined $value? " value=$value->[1]": '');
189 } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
190 my($token) = ($text{$t}->[0]);
191 printf OUTPUT "#. For the first occurrence,\n"
192 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
193 if ($token->string =~ /^<meta\b/is) {
194 my $type = $token->attributes->{'http-equiv'}->[1];
195 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
196 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
197 my $tag = uc($1);
198 my $type = (lc($tag) eq 'input'?
199 $token->attributes->{'type'}: undef);
200 my $name = $token->attributes->{'name'};
201 printf OUTPUT "#. %s\n", $tag
202 . (defined $type? " type=$type->[1]": '')
203 . (defined $name? " name=$name->[1]": '');
205 } elsif ($text{$t}->[0]->has_js_data) {
206 printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
207 printf OUTPUT "#. SCRIPT\n";
209 my $cformat_p;
210 for my $token (@{$text{$t}}) {
211 my $pathname = $token->pathname;
212 $pathname =~ s/^$directory_re//os;
213 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
214 if defined $pathname && defined $token->line_number;
215 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
217 printf OUTPUT "#, c-format\n" if $cformat_p;
218 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
219 TmplTokenizer::string_canon
220 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
221 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
222 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
226 ###############################################################################
228 sub convert_translation_file () {
229 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
230 VerboseWarnings::set_input_file_name $convert_from;
231 while (<INPUT>) {
232 chomp;
233 my($msgid, $msgstr) = split(/\t/);
234 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
235 unless defined $msgstr;
237 # Fixup some of the bad strings
238 $msgid =~ s/^SELECTED>//;
240 # Create dummy token
241 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
242 remember( $token, $msgid );
243 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
244 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
246 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
247 my $candidate = TmplTokenizer::charset_canon $2;
248 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
249 if defined $charset_in && $charset_in ne $candidate;
250 $charset_in = $candidate;
252 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
253 my $candidate = TmplTokenizer::charset_canon $2;
254 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
255 if defined $charset_out && $charset_out ne $candidate;
256 $charset_out = $candidate;
259 # The following assumption is correct; that's what HTML::Template assumes
260 if (!defined $charset_in) {
261 $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
262 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
266 ###############################################################################
268 sub usage ($) {
269 my($exitcode) = @_;
270 my $h = $exitcode? *STDERR: *STDOUT;
271 print $h <<EOF;
272 Usage: $0 [OPTIONS]
273 Extract translatable strings from given HTML::Template input files.
275 Input file location:
276 -f, --files-from=FILE Get list of input files from FILE
277 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
279 Output file location:
280 -o, --output=FILE Write output to specified file
282 HTML::Template options:
283 -a, --extract-all Extract all strings
284 --pedantic-warnings Issue warnings even for detected problems
285 which are likely to be harmless
287 Output details:
288 -s, --sort-output generate sorted output
289 -F, --sort-by-file sort output by file location
290 -v, --verbose explain what is being done
292 Informative output:
293 --help Display this help and exit
295 Try `perldoc $0' for perhaps more information.
297 exit($exitcode);
300 ###############################################################################
302 sub usage_error (;$) {
303 print STDERR "$_[0]\n" if @_;
304 print STDERR "Try `$0 --help' for more information.\n";
305 exit(-1);
308 ###############################################################################
310 Getopt::Long::config qw( bundling no_auto_abbrev );
311 GetOptions(
312 'a|extract-all' => \$extract_all_p,
313 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
314 'convert-from=s' => \$convert_from,
315 'D|directory=s' => \$directory,
316 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
317 'f|files-from=s' => \$files_from,
318 'I|input-charset=s' => \$charset_in, # INTERNAL
319 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
320 'O|output-charset=s' => \$charset_out, # INTERNAL
321 'output|o=s' => \$output,
322 'po-mode' => \$po_mode_p, # INTERNAL
323 's|sort-output' => sub { $sort = 's' },
324 'F|sort-by-file' => sub { $sort = 'F' },
325 'v|verbose' => \$verbose_p,
326 'help' => sub { usage(0) },
327 ) || usage_error;
329 VerboseWarnings::set_application_name $0;
330 VerboseWarnings::set_pedantic_mode $pedantic_p;
332 usage_error('Missing mandatory option -f')
333 unless defined $files_from || defined $convert_from;
334 $directory = '.' unless defined $directory;
336 usage_error('You cannot specify both --convert-from and --files-from')
337 if defined $convert_from && defined $files_from;
339 if (defined $output && $output ne '-') {
340 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
341 open(OUTPUT, ">$output") || die "$output: $!\n";
342 } else {
343 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
344 open(OUTPUT, ">&STDOUT");
347 if (defined $files_from) {
348 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
349 open(INPUT, "<$files_from") || die "$files_from: $!\n";
350 while (<INPUT>) {
351 chomp;
352 my $input = /^\//? $_: "$directory/$_";
353 my $h = TmplTokenizer->new( $input );
354 $h->set_allow_cformat( 1 );
355 VerboseWarnings::set_input_file_name $input;
356 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
357 text_extract( $h );
359 close INPUT;
360 } else {
361 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
362 convert_translation_file;
364 generate_po_file;
366 warn "This input will not work with Mozilla standards-compliant mode\n", undef
367 if TmplTokenizer::syntaxerror_p;
370 exit(-1) if TmplTokenizer::fatal_p;
372 ###############################################################################
374 =head1 DESCRIPTION
376 This is an experimental script based on the modularized
377 text-extract2.pl script. It has behaviour similar to
378 xgettext(1), and generates gettext-compatible output files.
380 A gettext-like format provides the following advantages:
382 =over
384 =item -
386 Translation to non-English-like languages with different word
387 order: gettext's c-format strings can theoretically be
388 emulated if we are able to do some analysis on the .tmpl input
389 and treat <TMPL_VAR> in a way similar to %s.
391 =item -
393 Context for the extracted strings: the gettext format provides
394 the filenames and line numbers where each string can be found.
395 The translator can read the source file and see the context,
396 in case the string by itself can mean several different things.
398 =item -
400 Place for the translator to add comments about the translations.
402 =item -
404 Gettext-compatible tools, if any, might be usable if we adopt
405 the gettext format.
407 =back
409 This script has already been in use for over a year and should
410 be reasonable stable. Nevertheless, it is still somewhat
411 experimental and there are still some issues.
413 Please refer to the explanation in tmpl_process3 for further
414 details.
416 If you want to generate GNOME-style POTFILES.in files, such
417 files (passed to -f) can be generated thus:
419 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
420 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
421 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
422 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
424 This is, however, quite pointless, because the "create" and
425 "update" actions have already been implemented in tmpl_process3.pl.
427 =head2 Strings inside JavaScript
429 In the SCRIPT elements, the script will attempt to scan for
430 _("I<string literal>") patterns, and extract the I<string literal>
431 as a translatable string.
433 Note that the C-like _(...) notation is required.
435 The JavaScript must actually define a _ function
436 so that the code remains correct JavaScript.
437 A suitable definition of such a function can be
439 function _(s) { return s } // dummy function for gettext
441 =head1 SEE ALSO
443 tmpl_process3.pl,
444 xgettext(1),
445 Locale::PO(3),
446 translator_doc.txt
448 =head1 BUGS
450 There probably are some. Bugs related to scanning of <INPUT>
451 tags seem to be especially likely to be present.
453 Its diagnostics are probably too verbose.
455 When a <TMPL_VAR> within a JavaScript-related attribute is
456 detected, the script currently displays no warnings at all.
457 It might be good to display some kind of warning.
459 Its sort order (-s option) seems to be different than the real
460 xgettext(1)'s sort option. This will result in translation
461 strings inside the generated PO file spuriously moving about
462 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
464 If a Javascript string has leading spaces, it will
465 generate strings with spurious leading spaces,
466 leading to failure to match the strings when actually generating
467 translated files.
469 =cut