5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
10 use lib
$FindBin::Bin
;
20 use vars
qw( $convert_from );
21 use vars qw( $files_from $directory $output $sort );
22 use vars qw( $extract_all_p );
23 use vars qw( $pedantic_p );
24 use vars qw( %text %translation );
25 use vars qw( $charset_in $charset_out );
26 use vars qw( $disable_fuzzy_p );
27 use vars qw( $verbose_p );
28 use vars qw( $po_mode_p );
32 ###############################################################################
34 sub string_negligible_p {
35 my($t) = @_; # a string
36 # Don't emit pure whitespace, pure numbers, pure punctuation,
37 # single letters, or TMPL_VAR's.
38 # Punctuation should arguably be translated. But without context
39 # they are untranslatable. Note that $t is a string, not a token object.
40 return !$extract_all_p && (
41 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
42 || $t =~ /^\d+$/ # purely digits
43 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
44 || $t =~ /^[A-Za-z]$/ # single letters
45 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
46 || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities
50 sub token_negligible_p {
53 return !$extract_all_p && (
54 $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
55 : $t == C4::TmplTokenType::DIRECTIVE() ? 1
56 : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
61 $t == C4::TmplTokenType::DIRECTIVE() ? '1'
62 : $t == C4::TmplTokenType::TAG() ? ''
63 : token_negligible_p($_) ? ''
70 ###############################################################################
73 my($token, $string) = @_;
74 # If we determine that the string is negligible, don't bother to remember
75 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
76 my $key = TmplTokenizer::string_canon( $string );
77 $text{$key} = [] unless defined $text{$key};
78 push @{$text{$key}}, $token;
82 ###############################################################################
86 # The real gettext tools seem to sort case sensitively; I don't know why
87 @t = sort { $a cmp $b } @t if $sort eq 's';
89 my @aa = sort { $a->pathname cmp $b->pathname
90 || $a->line_number <=> $b->line_number } @{$text{$a}};
91 my @bb = sort { $a->pathname cmp $b->pathname
92 || $a->line_number <=> $b->line_number } @{$text{$b}};
93 $aa[0]->pathname cmp $bb[0]->pathname
94 || $aa[0]->line_number <=> $bb[0]->line_number;
99 ###############################################################################
104 my $s = TmplTokenizer::next_token $h;
105 last unless defined $s;
106 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
107 if ($kind eq C4::TmplTokenType::TEXT) {
108 if ($t =~ /\S/s && $t !~ /<!/){
111 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
112 if ($s->form =~ /\S/s && $s->form !~ /<!/){
113 remember( $s, $s->form );
115 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
116 # value [tag=input], meta
118 $tag = lc($1) if $t =~ /^<(\S+)/s;
119 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
121 next if $a eq 'label' && $tag ne 'optgroup';
122 next if $a eq 'content' && $tag ne 'meta';
123 next if $a eq 'value' && ($tag ne 'input'
124 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
125 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
126 $val = TmplTokenizer::trim $val;
127 # for selected attributes replace '[%..%]' with '%s' globally
128 if ( $a =~ /title|value|alt|content|placeholder/ ) {
129 $val =~ s/\[\%.*?\%\]/\%s/g;
131 # save attribute text for translation
132 remember( $s, $val ) if $val =~ /\S/s;
135 } elsif ($s->has_js_data) {
136 for my $t (@{$s->js_data}) {
137 remember( $s, $t->[3] ) if $t->[0]; # FIXME
143 ###############################################################################
145 sub generate_strings_list {
146 # Emit all extracted strings.
147 for my $t (string_list) {
148 printf $OUTPUT "%s\n", $t;
152 ###############################################################################
154 sub generate_po_file {
155 # We don't emit the Plural-Forms header; it's meaningless for us
156 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
157 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
158 # Time stamps aren't exactly right semantically. I don't know how to fix it.
159 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
160 my $time_pot = $time;
161 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
163 # SOME DESCRIPTIVE TITLE.
164 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
165 # This file is distributed under the same license as the PACKAGE package.
166 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
169 print $OUTPUT <<EOF unless $disable_fuzzy_p;
175 "Project-Id-Version: PACKAGE VERSION\\n"
176 "POT-Creation-Date: $time_pot\\n"
177 "PO-Revision-Date: $time_po\\n"
178 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
179 "Language-Team: LANGUAGE <LL\@li.org>\\n"
180 "MIME-Version: 1.0\\n"
181 "Content-Type: text/plain; charset=$pot_charset\\n"
182 "Content-Transfer-Encoding: 8bit\\n"
185 my $directory_re = quotemeta("$directory/");
186 for my $t (string_list
) {
187 if ($text{$t}->[0]->type == C4
::TmplTokenType
::TEXT_PARAMETRIZED
) {
188 my($token, $n) = ($text{$t}->[0], 0);
189 printf $OUTPUT "#. For the first occurrence,\n"
190 if @
{$text{$t}} > 1 && $token->parameters_and_fields > 0;
191 for my $param ($token->parameters_and_fields) {
193 my $type = $param->type;
194 my $subtype = ($type == C4
::TmplTokenType
::TAG
195 && $param->string =~ /^<input\b/is?
196 $param->attributes->{'type'}->[1]: undef);
197 my $fmt = TmplTokenizer
::_formalize
( $param );
199 if ($type == C4
::TmplTokenType
::DIRECTIVE
) {
200 # $type = "Template::Toolkit Directive";
201 $type = $param->string =~ /\[%(.*?)%\]/is?
$1: 'ERROR';
202 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
204 printf $OUTPUT "#. %s: %s\n", $fmt,
205 "$type" . (defined $name?
" name=$name": '');
207 my $name = $param->attributes->{'name'};
209 $value = $param->attributes->{'value'}
210 unless $subtype =~ /^(?:text)$/;
211 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
212 . (defined $name?
" name=$name->[1]": '')
213 . (defined $value?
" value=$value->[1]": '');
216 } elsif ($text{$t}->[0]->type == C4
::TmplTokenType
::TAG
) {
217 my($token) = ($text{$t}->[0]);
218 printf $OUTPUT "#. For the first occurrence,\n"
219 if @
{$text{$t}} > 1 && $token->parameters_and_fields > 0;
220 if ($token->string =~ /^<meta\b/is) {
221 my $type = $token->attributes->{'http-equiv'}->[1];
222 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
223 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
225 my $type = (lc($tag) eq 'input'?
226 $token->attributes->{'type'}: undef);
227 my $name = $token->attributes->{'name'};
228 printf $OUTPUT "#. %s\n", $tag
229 . (defined $type?
" type=$type->[1]": '')
230 . (defined $name?
" name=$name->[1]": '');
232 } elsif ($text{$t}->[0]->has_js_data) {
233 printf $OUTPUT "#. For the first occurrence,\n" if @
{$text{$t}} > 1;
234 printf $OUTPUT "#. SCRIPT\n";
237 for my $token (@
{$text{$t}}) {
238 my $pathname = $token->pathname;
239 $pathname =~ s/^$directory_re//os;
240 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
241 printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
242 if defined $pathname && defined $token->line_number;
243 $cformat_p = 1 if $token->type == C4
::TmplTokenType
::TEXT_PARAMETRIZED
;
245 printf $OUTPUT "#, c-format\n" if $cformat_p;
246 printf $OUTPUT "msgid %s\n", TmplTokenizer
::quote_po
247 TmplTokenizer
::string_canon
248 TmplTokenizer
::charset_convert
$t, $charset_in, $charset_out;
249 printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
250 TmplTokenizer
::quote_po
( $translation{$t} ): "\"\"");
254 ###############################################################################
256 sub convert_translation_file
{
257 open(my $INPUT, '<', $convert_from) || die "$convert_from: $!\n";
258 VerboseWarnings
::set_input_file_name
$convert_from;
261 my($msgid, $msgstr) = split(/\t/);
262 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
263 unless defined $msgstr;
265 # Fixup some of the bad strings
266 $msgid =~ s/^SELECTED>//;
269 my $token = TmplToken
->new( $msgid, C4
::TmplTokenType
::UNKNOWN
, undef, undef );
270 remember
( $token, $msgid );
271 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
272 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
274 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
275 my $candidate = TmplTokenizer
::charset_canon
$2;
276 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
277 if defined $charset_in && $charset_in ne $candidate;
278 $charset_in = $candidate;
280 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
281 my $candidate = TmplTokenizer
::charset_canon
$2;
282 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
283 if defined $charset_out && $charset_out ne $candidate;
284 $charset_out = $candidate;
287 # The following assumption is correct; that's what HTML::Template assumes
288 if (!defined $charset_in) {
289 $charset_in = $charset_out = TmplTokenizer
::charset_canon
'utf-8';
290 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
294 ###############################################################################
298 my $h = $exitcode?
*STDERR
: *STDOUT
;
301 Extract translatable strings from given HTML::Template input files.
304 -f, --files-from=FILE Get list of input files from FILE
305 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
307 Output file location:
308 -o, --output=FILE Write output to specified file
310 HTML::Template options:
311 -a, --extract-all Extract all strings
312 --pedantic-warnings Issue warnings even for detected problems
313 which are likely to be harmless
316 -s, --sort-output generate sorted output
317 -F, --sort-by-file sort output by file location
318 -v, --verbose explain what is being done
321 --help Display this help and exit
323 Try `perldoc $0' for perhaps more information.
328 ###############################################################################
331 print STDERR
"$_[0]\n" if @_;
332 print STDERR
"Try `$0 --help' for more information.\n";
336 ###############################################################################
338 Getopt
::Long
::config
qw( bundling no_auto_abbrev );
340 'a|extract-all' => \
$extract_all_p,
341 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
342 'convert-from=s' => \
$convert_from,
343 'D|directory=s' => \
$directory,
344 'disable-fuzzy' => \
$disable_fuzzy_p, # INTERNAL
345 'f|files-from=s' => \
$files_from,
346 'I|input-charset=s' => \
$charset_in, # INTERNAL
347 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
348 'O|output-charset=s' => \
$charset_out, # INTERNAL
349 'output|o=s' => \
$output,
350 'po-mode' => \
$po_mode_p, # INTERNAL
351 's|sort-output' => sub { $sort = 's' },
352 'F|sort-by-file' => sub { $sort = 'F' },
353 'v|verbose' => \
$verbose_p,
354 'help' => sub { usage
(0) },
357 VerboseWarnings
::set_application_name
$0;
358 VerboseWarnings
::set_pedantic_mode
$pedantic_p;
360 usage_error
('Missing mandatory option -f')
361 unless defined $files_from || defined $convert_from;
362 $directory = '.' unless defined $directory;
364 usage_error
('You cannot specify both --convert-from and --files-from')
365 if defined $convert_from && defined $files_from;
367 if (defined $output && $output ne '-') {
368 print STDERR
"$0: Opening output file \"$output\"\n" if $verbose_p;
369 open($OUTPUT, '>', $output) || die "$output: $!\n";
371 print STDERR
"$0: Outputting to STDOUT...\n" if $verbose_p;
372 open($OUTPUT, ">&STDOUT");
375 if (defined $files_from) {
376 print STDERR
"$0: Opening input file list \"$files_from\"\n" if $verbose_p;
377 open(my $INPUT, '<', $files_from) || die "$files_from: $!\n";
380 my $input = /^\//? $_: "$directory/$_";
381 my $h = TmplTokenizer->new( $input );
382 $h->set_allow_cformat( 1 );
383 VerboseWarnings::set_input_file_name $input;
384 print STDERR "$0: Processing file
\"$input\"\n" if $verbose_p;
389 print STDERR "$0: Converting
\"$convert_from\"\n" if $verbose_p;
390 convert_translation_file;
394 warn "This input will
not work with Mozilla standards
-compliant mode
\n", undef
395 if TmplTokenizer::syntaxerror_p;
398 exit(-1) if TmplTokenizer::fatal_p;
400 ###############################################################################
404 This script has behaviour similar to
405 xgettext(1), and generates gettext-compatible output files.
407 A gettext-like format provides the following advantages:
413 Translation to non-English-like languages with different word
414 order: gettext's c-format strings can theoretically be
415 emulated if we are able to do some analysis on the .tt input
416 and treat <TMPL_VAR> in a way similar to %s.
420 Context for the extracted strings: the gettext format provides
421 the filenames and line numbers where each string can be found.
422 The translator can read the source file and see the context,
423 in case the string by itself can mean several different things.
427 Place for the translator to add comments about the translations.
431 Gettext-compatible tools, if any, might be usable if we adopt
436 This script has already been in use for over a year and should
437 be reasonably stable. Nevertheless, it is still somewhat
438 experimental and there are still some issues.
440 Please refer to the explanation in tmpl_process3 for further
443 If you want to generate GNOME-style POTFILES.in files, such
444 files (passed to -f) can be generated thus:
446 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
447 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
448 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
449 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
451 This is, however, quite pointless, because the "create" and
452 "update" actions have already been implemented in tmpl_process3.pl.
454 =head2 Strings inside JavaScript
456 In the SCRIPT elements, the script will attempt to scan for
457 _("I<string literal>") patterns, and extract the I<string literal>
458 as a translatable string.
460 Note that the C-like _(...) notation is required.
462 The JavaScript must actually define a _ function
463 so that the code remains correct JavaScript.
464 A suitable definition of such a function can be
466 function _(s) { return s } // dummy function for gettext
477 There probably are some. Bugs related to scanning of <INPUT>
478 tags seem to be especially likely to be present.
480 Its diagnostics are probably too verbose.
482 When a <TMPL_VAR> within a JavaScript-related attribute is
483 detected, the script currently displays no warnings at all.
484 It might be good to display some kind of warning.
486 Its sort order (-s option) seems to be different than the real
487 xgettext(1)'s sort option. This will result in translation
488 strings inside the generated PO file spuriously moving about
489 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
491 If a JavaScript string has leading spaces, it will
492 generate strings with spurious leading spaces,
493 leading to failure to match the strings when actually generating