5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
10 use lib
$FindBin::Bin
;
20 use vars
qw( $convert_from );
21 use vars qw( $files_from $directory $output $sort );
22 use vars qw( $extract_all_p );
23 use vars qw( $pedantic_p );
24 use vars qw( %text %translation );
25 use vars qw( $charset_in $charset_out );
26 use vars qw( $disable_fuzzy_p );
27 use vars qw( $verbose_p );
28 use vars qw( $po_mode_p );
32 ###############################################################################
34 sub string_negligible_p {
35 my($t) = @_; # a string
36 # Don't emit pure whitespace, pure numbers, pure punctuation,
37 # single letters, or TMPL_VAR's.
38 # Punctuation should arguably be translated. But without context
39 # they are untranslatable. Note that $t is a string, not a token object.
40 return !$extract_all_p && (
41 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
42 || $t =~ /^\d+$/ # purely digits
43 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
44 || $t =~ /^[A-Za-z]$/ # single letters
45 || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
46 || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities
47 || $t =~ /^\s*<\?.*\?>/ # ignore xml prolog
51 sub token_negligible_p {
54 return !$extract_all_p && (
55 $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
56 : $t == C4::TmplTokenType::DIRECTIVE() ? 1
57 : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
62 $t == C4::TmplTokenType::DIRECTIVE() ? '1'
63 : $t == C4::TmplTokenType::TAG() ? ''
64 : token_negligible_p($_) ? ''
71 ###############################################################################
74 my($token, $string) = @_;
75 # If we determine that the string is negligible, don't bother to remember
76 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
77 my $key = TmplTokenizer::string_canon( $string );
78 $text{$key} = [] unless defined $text{$key};
79 push @{$text{$key}}, $token;
83 ###############################################################################
87 # The real gettext tools seems to sort case sensitively; I don't know why
88 @t = sort { $a cmp $b } @t if $sort eq 's';
90 my @aa = sort { $a->pathname cmp $b->pathname
91 || $a->line_number <=> $b->line_number } @{$text{$a}};
92 my @bb = sort { $a->pathname cmp $b->pathname
93 || $a->line_number <=> $b->line_number } @{$text{$b}};
94 $aa[0]->pathname cmp $bb[0]->pathname
95 || $aa[0]->line_number <=> $bb[0]->line_number;
100 ###############################################################################
105 my $s = TmplTokenizer::next_token $h;
106 last unless defined $s;
107 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
108 if ($kind eq C4::TmplTokenType::TEXT) {
109 if ($t =~ /\S/s && $t !~ /<!/){
112 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
113 if ($s->form =~ /\S/s && $s->form !~ /<!/){
114 remember( $s, $s->form );
116 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
117 # value [tag=input], meta
119 $tag = lc($1) if $t =~ /^<(\S+)/s;
120 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
122 next if $a eq 'label' && $tag ne 'optgroup';
123 next if $a eq 'content' && $tag ne 'meta';
124 next if $a eq 'value' && ($tag ne 'input'
125 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
126 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
127 $val = TmplTokenizer::trim $val;
128 # for selected attributes replace '[%..%]' with '%s' globally
129 if ( $a =~ /title|value|alt|content|placeholder/ ) {
130 $val =~ s/\[\%.*?\%\]/\%s/g;
132 # save attribute text for translation
133 remember( $s, $val ) if $val =~ /\S/s;
136 } elsif ($s->has_js_data) {
137 for my $t (@{$s->js_data}) {
138 remember( $s, $t->[3] ) if $t->[0]; # FIXME
144 ###############################################################################
146 sub generate_strings_list {
147 # Emit all extracted strings.
148 for my $t (string_list) {
149 printf $OUTPUT "%s\n", $t;
153 ###############################################################################
155 sub generate_po_file {
156 # We don't emit the Plural-Forms header; it's meaningless for us
157 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
158 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
159 # Time stamps aren't exactly right semantically. I don't know how to fix it.
160 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
161 my $time_pot = $time;
162 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
164 # SOME DESCRIPTIVE TITLE.
165 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
166 # This file is distributed under the same license as the PACKAGE package.
167 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
170 print $OUTPUT <<EOF unless $disable_fuzzy_p;
176 "Project-Id-Version: PACKAGE VERSION\\n"
177 "POT-Creation-Date: $time_pot\\n"
178 "PO-Revision-Date: $time_po\\n"
179 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
180 "Language-Team: LANGUAGE <LL\@li.org>\\n"
181 "MIME-Version: 1.0\\n"
182 "Content-Type: text/plain; charset=$pot_charset\\n"
183 "Content-Transfer-Encoding: 8bit\\n"
186 my $directory_re = quotemeta("$directory/");
187 for my $t (string_list
) {
188 if ($text{$t}->[0]->type == C4
::TmplTokenType
::TEXT_PARAMETRIZED
) {
189 my($token, $n) = ($text{$t}->[0], 0);
190 printf $OUTPUT "#. For the first occurrence,\n"
191 if @
{$text{$t}} > 1 && $token->parameters_and_fields > 0;
192 for my $param ($token->parameters_and_fields) {
194 my $type = $param->type;
195 my $subtype = ($type == C4
::TmplTokenType
::TAG
196 && $param->string =~ /^<input\b/is?
197 $param->attributes->{'type'}->[1]: undef);
198 my $fmt = TmplTokenizer
::_formalize
( $param );
200 if ($type == C4
::TmplTokenType
::DIRECTIVE
) {
201 # $type = "Template::Toolkit Directive";
202 $type = $param->string =~ /\[%(.*?)%\]/is?
$1: 'ERROR';
203 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
205 printf $OUTPUT "#. %s: %s\n", $fmt,
206 "$type" . (defined $name?
" name=$name": '');
208 my $name = $param->attributes->{'name'};
210 $value = $param->attributes->{'value'}
211 unless $subtype =~ /^(?:text)$/;
212 printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
213 . (defined $name?
" name=$name->[1]": '')
214 . (defined $value?
" value=$value->[1]": '');
217 } elsif ($text{$t}->[0]->type == C4
::TmplTokenType
::TAG
) {
218 my($token) = ($text{$t}->[0]);
219 printf $OUTPUT "#. For the first occurrence,\n"
220 if @
{$text{$t}} > 1 && $token->parameters_and_fields > 0;
221 if ($token->string =~ /^<meta\b/is) {
222 my $type = $token->attributes->{'http-equiv'}->[1];
223 print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
224 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
226 my $type = (lc($tag) eq 'input'?
227 $token->attributes->{'type'}: undef);
228 my $name = $token->attributes->{'name'};
229 printf $OUTPUT "#. %s\n", $tag
230 . (defined $type?
" type=$type->[1]": '')
231 . (defined $name?
" name=$name->[1]": '');
233 } elsif ($text{$t}->[0]->has_js_data) {
234 printf $OUTPUT "#. For the first occurrence,\n" if @
{$text{$t}} > 1;
235 printf $OUTPUT "#. SCRIPT\n";
238 for my $token (@
{$text{$t}}) {
239 my $pathname = $token->pathname;
240 $pathname =~ s/^$directory_re//os;
241 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
242 printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
243 if defined $pathname && defined $token->line_number;
244 $cformat_p = 1 if $token->type == C4
::TmplTokenType
::TEXT_PARAMETRIZED
;
246 printf $OUTPUT "#, c-format\n" if $cformat_p;
247 printf $OUTPUT "msgid %s\n", TmplTokenizer
::quote_po
248 TmplTokenizer
::string_canon
249 TmplTokenizer
::charset_convert
$t, $charset_in, $charset_out;
250 printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
251 TmplTokenizer
::quote_po
( $translation{$t} ): "\"\"");
255 ###############################################################################
257 sub convert_translation_file
{
258 open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n";
259 VerboseWarnings
::set_input_file_name
$convert_from;
262 my($msgid, $msgstr) = split(/\t/);
263 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
264 unless defined $msgstr;
266 # Fixup some of the bad strings
267 $msgid =~ s/^SELECTED>//;
270 my $token = TmplToken
->new( $msgid, C4
::TmplTokenType
::UNKNOWN
, undef, undef );
271 remember
( $token, $msgid );
272 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
273 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
275 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
276 my $candidate = TmplTokenizer
::charset_canon
$2;
277 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
278 if defined $charset_in && $charset_in ne $candidate;
279 $charset_in = $candidate;
281 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
282 my $candidate = TmplTokenizer
::charset_canon
$2;
283 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
284 if defined $charset_out && $charset_out ne $candidate;
285 $charset_out = $candidate;
288 # The following assumption is correct; that's what HTML::Template assumes
289 if (!defined $charset_in) {
290 $charset_in = $charset_out = TmplTokenizer
::charset_canon
'utf-8';
291 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
295 ###############################################################################
299 my $h = $exitcode?
*STDERR
: *STDOUT
;
302 Extract translatable strings from given HTML::Template input files.
305 -f, --files-from=FILE Get list of input files from FILE
306 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
308 Output file location:
309 -o, --output=FILE Write output to specified file
311 HTML::Template options:
312 -a, --extract-all Extract all strings
313 --pedantic-warnings Issue warnings even for detected problems
314 which are likely to be harmless
317 -s, --sort-output generate sorted output
318 -F, --sort-by-file sort output by file location
319 -v, --verbose explain what is being done
322 --help Display this help and exit
324 Try `perldoc $0' for perhaps more information.
329 ###############################################################################
332 print STDERR
"$_[0]\n" if @_;
333 print STDERR
"Try `$0 --help' for more information.\n";
337 ###############################################################################
339 Getopt
::Long
::config
qw( bundling no_auto_abbrev );
341 'a|extract-all' => \
$extract_all_p,
342 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
343 'convert-from=s' => \
$convert_from,
344 'D|directory=s' => \
$directory,
345 'disable-fuzzy' => \
$disable_fuzzy_p, # INTERNAL
346 'f|files-from=s' => \
$files_from,
347 'I|input-charset=s' => \
$charset_in, # INTERNAL
348 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
349 'O|output-charset=s' => \
$charset_out, # INTERNAL
350 'output|o=s' => \
$output,
351 'po-mode' => \
$po_mode_p, # INTERNAL
352 's|sort-output' => sub { $sort = 's' },
353 'F|sort-by-file' => sub { $sort = 'F' },
354 'v|verbose' => \
$verbose_p,
355 'help' => sub { usage
(0) },
358 VerboseWarnings
::set_application_name
$0;
359 VerboseWarnings
::set_pedantic_mode
$pedantic_p;
361 usage_error
('Missing mandatory option -f')
362 unless defined $files_from || defined $convert_from;
363 $directory = '.' unless defined $directory;
365 usage_error
('You cannot specify both --convert-from and --files-from')
366 if defined $convert_from && defined $files_from;
368 if (defined $output && $output ne '-') {
369 print STDERR
"$0: Opening output file \"$output\"\n" if $verbose_p;
370 open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
372 print STDERR
"$0: Outputting to STDOUT...\n" if $verbose_p;
373 open($OUTPUT, ">&STDOUT");
376 if (defined $files_from) {
377 print STDERR
"$0: Opening input file list \"$files_from\"\n" if $verbose_p;
378 open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n";
381 my $input = /^\//? $_: "$directory/$_";
382 my $h = TmplTokenizer->new( $input );
383 $h->set_allow_cformat( 1 );
384 VerboseWarnings::set_input_file_name $input;
385 print STDERR "$0: Processing file
\"$input\"\n" if $verbose_p;
390 print STDERR "$0: Converting
\"$convert_from\"\n" if $verbose_p;
391 convert_translation_file;
395 warn "This input will
not work with Mozilla standards
-compliant mode
\n", undef
396 if TmplTokenizer::syntaxerror_p;
399 exit(-1) if TmplTokenizer::fatal_p;
401 ###############################################################################
405 This script has behaviour similar to
406 xgettext(1), and generates gettext-compatible output files.
408 A gettext-like format provides the following advantages:
414 Translation to non-English-like languages with different word
415 order: gettext's c-format strings can theoretically be
416 emulated if we are able to do some analysis on the .tt input
417 and treat <TMPL_VAR> in a way similar to %s.
421 Context for the extracted strings: the gettext format provides
422 the filenames and line numbers where each string can be found.
423 The translator can read the source file and see the context,
424 in case the string by itself can mean several different things.
428 Place for the translator to add comments about the translations.
432 Gettext-compatible tools, if any, might be usable if we adopt
437 This script has already been in use for over a year and should
438 be reasonable stable. Nevertheless, it is still somewhat
439 experimental and there are still some issues.
441 Please refer to the explanation in tmpl_process3 for further
444 If you want to generate GNOME-style POTFILES.in files, such
445 files (passed to -f) can be generated thus:
447 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
448 -name \*.inc -o -name \*.tt) > opac/POTFILES.in
449 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
450 -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
452 This is, however, quite pointless, because the "create
" and
453 "update
" actions have already been implemented in tmpl_process3.pl.
455 =head2 Strings inside JavaScript
457 In the SCRIPT elements, the script will attempt to scan for
458 _("I
<string literal
>") patterns, and extract the I<string literal>
459 as a translatable string.
461 Note that the C-like _(...) notation is required.
463 The JavaScript must actually define a _ function
464 so that the code remains correct JavaScript.
465 A suitable definition of such a function can be
467 function _(s) { return s } // dummy function for gettext
478 There probably are some. Bugs related to scanning of <INPUT>
479 tags seem to be especially likely to be present.
481 Its diagnostics are probably too verbose.
483 When a <TMPL_VAR> within a JavaScript-related attribute is
484 detected, the script currently displays no warnings at all.
485 It might be good to display some kind of warning.
487 Its sort order (-s option) seems to be different than the real
488 xgettext(1)'s sort option. This will result in translation
489 strings inside the generated PO file spuriously moving about
490 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
492 If a Javascript string has leading spaces, it will
493 generate strings with spurious leading spaces,
494 leading to failure to match the strings when actually generating