misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
   6
   7 =cut
   8
   9 use FindBin;
  10 use lib $FindBin::Bin;
  11
  12 use strict;
  13 use warnings;
  14 use Getopt::Long;
  15 use POSIX;
  16 use Locale::PO;
  17 use TmplTokenizer;
  18 use VerboseWarnings;
  19
  20 use vars qw( $convert_from );
  21 use vars qw( $files_from $directory $output $sort );
  22 use vars qw( $extract_all_p );
  23 use vars qw( $pedantic_p );
  24 use vars qw( %text %translation );
  25 use vars qw( $charset_in $charset_out );
  26 use vars qw( $disable_fuzzy_p );
  27 use vars qw( $verbose_p );
  28 use vars qw( $po_mode_p );
  29
  30 our $OUTPUT;
  31
  32 ###############################################################################
  33
  34 sub string_negligible_p {
  35     my($t) = @_;                                # a string
  36     # Don't emit pure whitespace, pure numbers, pure punctuation,
  37     # single letters, or TMPL_VAR's.
  38     # Punctuation should arguably be translated. But without context
  39     # they are untranslatable. Note that $t is a string, not a token object.
  40     return !$extract_all_p && (
  41                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  42             || $t =~ /^\d+$/                    # purely digits
  43             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  44             || $t =~ /^[A-Za-z]$/               # single letters
  45             || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
  46         || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ )    # pure TT entities
  47         || $t =~ /^\s*<\?.*\?>/                               # ignore xml prolog
  48         )
  49 }
  50
  51 sub token_negligible_p {
  52     my ($x) = @_;
  53     my $t = $x->type;
  54     return !$extract_all_p && (
  55           $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
  56         : $t == C4::TmplTokenType::DIRECTIVE() ? 1
  57         : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
  58         && join(
  59             '',
  60             map {
  61                 my $t = $_->type;
  62                     $t == C4::TmplTokenType::DIRECTIVE() ? '1'
  63                   : $t == C4::TmplTokenType::TAG()       ? ''
  64                   : token_negligible_p($_)               ? ''
  65                   : '1'
  66             } @{ $x->children }
  67         ) eq ''
  68     );
  69 }
  70
  71 ###############################################################################
  72
  73 sub remember {
  74     my($token, $string) = @_;
  75     # If we determine that the string is negligible, don't bother to remember
  76     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  77         my $key = TmplTokenizer::string_canon( $string );
  78         $text{$key} = [] unless defined $text{$key};
  79         push @{$text{$key}}, $token;
  80     }
  81 }
  82
  83 ###############################################################################
  84
  85 sub string_list {
  86     my @t = keys %text;
  87     # The real gettext tools seems to sort case sensitively; I don't know why
  88     @t = sort { $a cmp $b } @t if $sort eq 's';
  89     @t = sort {
  90             my @aa = sort { $a->pathname cmp $b->pathname
  91                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  92             my @bb = sort { $a->pathname cmp $b->pathname
  93                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  94             $aa[0]->pathname cmp $bb[0]->pathname
  95                     || $aa[0]->line_number <=> $bb[0]->line_number;
  96         } @t if $sort eq 'F';
  97     return @t;
  98 }
  99
 100   ###############################################################################
 101
 102 sub text_extract {
 103     my($h) = @_;
 104     for (;;) {
 105         my $s = TmplTokenizer::next_token($h);
 106         last unless defined $s;
 107         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
 108         if ($kind eq C4::TmplTokenType::TEXT) {
 109             if ($t =~ /\S/s && $t !~ /<!/){
 110                 remember( $s, $t );
 111             }
 112         } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
 113             if ($s->form =~ /\S/s && $s->form !~ /<!/){
 114                 remember( $s, $s->form );
 115             }
 116         } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
 117             # value [tag=input], meta
 118             my $tag;
 119             $tag = lc($1) if $t =~ /^<(\S+)/s;
 120             for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder', 'aria-label') {
 121                 if ($attr->{$a}) {
 122                     next if $a eq 'label' && $tag ne 'optgroup';
 123                     next if $a eq 'content' && $tag ne 'meta';
 124                     next if $a eq 'value' && ($tag ne 'input'
 125                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 126                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 127                     $val = TmplTokenizer::trim($val);
 128                     # for selected attributes replace '[%..%]' with '%s' globally
 129                     if ( $a =~ /title|value|alt|content|placeholder|aria-label/ ) {
 130                         $val =~ s/\[\%.*?\%\]/\%s/g;
 131                     }
 132                     # save attribute text for translation
 133                     remember( $s, $val ) if $val =~ /\S/s;
 134                 }
 135             }
 136         } elsif ($s->has_js_data) {
 137             for my $t (@{$s->js_data}) {
 138               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 139             }
 140         }
 141     }
 142 }
 143
 144 ###############################################################################
 145
 146 sub generate_strings_list {
 147     # Emit all extracted strings.
 148     for my $t (string_list) {
 149         printf $OUTPUT "%s\n", $t;
 150     }
 151 }
 152
 153 ###############################################################################
 154
 155 sub generate_po_file {
 156     # We don't emit the Plural-Forms header; it's meaningless for us
 157     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 158     $pot_charset = TmplTokenizer::charset_canon($pot_charset);
 159     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 160     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 161     my $time_pot = $time;
 162     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 163     print $OUTPUT <<EOF;
 164 # SOME DESCRIPTIVE TITLE.
 165 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 166 # This file is distributed under the same license as the PACKAGE package.
 167 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 168 #
 169 EOF
 170     print $OUTPUT <<EOF unless $disable_fuzzy_p;
 171 #, fuzzy
 172 EOF
 173     print $OUTPUT <<EOF;
 174 msgid ""
 175 msgstr ""
 176 "Project-Id-Version: PACKAGE VERSION\\n"
 177 "POT-Creation-Date: $time_pot\\n"
 178 "PO-Revision-Date: $time_po\\n"
 179 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 180 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 181 "MIME-Version: 1.0\\n"
 182 "Content-Type: text/plain; charset=$pot_charset\\n"
 183 "Content-Transfer-Encoding: 8bit\\n"
 184
 185 EOF
 186     my $directory_re = quotemeta("$directory/");
 187     for my $t (string_list) {
 188         if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
 189             my($token, $n) = ($text{$t}->[0], 0);
 190         printf $OUTPUT "#. For the first occurrence,\n"
 191                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 192             for my $param ($token->parameters_and_fields) {
 193                 $n += 1;
 194                 my $type = $param->type;
 195                 my $subtype = ($type == C4::TmplTokenType::TAG
 196                         && $param->string =~ /^<input\b/is?
 197                                 $param->attributes->{'type'}->[1]: undef);
 198                 my $fmt = TmplTokenizer::_formalize( $param );
 199                 $fmt =~ s/^%/%$n\$/;
 200                 if ($type == C4::TmplTokenType::DIRECTIVE) {
 201 #                   $type = "Template::Toolkit Directive";
 202                     $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
 203                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 204                             $2: undef;
 205             printf $OUTPUT "#. %s: %s\n", $fmt,
 206                         "$type" . (defined $name? " name=$name": '');
 207                 } else {
 208                     my $name = $param->attributes->{'name'};
 209             my $value;
 210             $value = $param->attributes->{'value'}
 211                             unless $subtype =~ /^(?:text)$/;
 212             printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 213                             . (defined $name?  " name=$name->[1]": '')
 214                             . (defined $value? " value=$value->[1]": '');
 215                 }
 216             }
 217         } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
 218             my($token) = ($text{$t}->[0]);
 219         printf $OUTPUT "#. For the first occurrence,\n"
 220                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 221             if ($token->string =~ /^<meta\b/is) {
 222                 my $type = $token->attributes->{'http-equiv'}->[1];
 223         print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
 224             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 225                 my $tag = uc($1);
 226                 my $type = (lc($tag) eq 'input'?
 227                         $token->attributes->{'type'}: undef);
 228                 my $name = $token->attributes->{'name'};
 229         printf $OUTPUT "#. %s\n", $tag
 230                     . (defined $type? " type=$type->[1]": '')
 231                     . (defined $name? " name=$name->[1]": '');
 232             }
 233         } elsif ($text{$t}->[0]->has_js_data) {
 234         printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 235         printf $OUTPUT "#. SCRIPT\n";
 236         }
 237         my $cformat_p;
 238         for my $token (@{$text{$t}}) {
 239             my $pathname = $token->pathname;
 240             $pathname =~ s/^$directory_re//os;
 241         $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 242         printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 243                     if defined $pathname && defined $token->line_number;
 244             $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
 245         }
 246         printf $OUTPUT "#, c-format\n" if $cformat_p;
 247         printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po(
 248             TmplTokenizer::string_canon(
 249                 TmplTokenizer::charset_convert($t, $charset_in, $charset_out)
 250             )
 251         );
 252         printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 253                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 254     }
 255 }
 256
 257 ###############################################################################
 258
 259 sub convert_translation_file {
 260     open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n";
 261     VerboseWarnings::set_input_file_name($convert_from);
 262     while (<$INPUT>) {
 263         chomp;
 264         my($msgid, $msgstr) = split(/\t/);
 265         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 266                 unless defined $msgstr;
 267
 268         # Fixup some of the bad strings
 269         $msgid =~ s/^SELECTED>//;
 270
 271         # Create dummy token
 272         my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
 273         remember( $token, $msgid );
 274         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 275         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 276
 277         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 278         my $candidate = TmplTokenizer::charset_canon($2);
 279             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 280                     if defined $charset_in && $charset_in ne $candidate;
 281             $charset_in = $candidate;
 282         }
 283         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 284         my $candidate = TmplTokenizer::charset_canon($2);
 285             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 286                     if defined $charset_out && $charset_out ne $candidate;
 287             $charset_out = $candidate;
 288         }
 289     }
 290     # The following assumption is correct; that's what HTML::Template assumes
 291     if (!defined $charset_in) {
 292         $charset_in = $charset_out = TmplTokenizer::charset_canon('utf-8');
 293         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 294     }
 295 }
 296
 297 ###############################################################################
 298
 299 sub usage {
 300     my($exitcode) = @_;
 301     my $h = $exitcode? *STDERR: *STDOUT;
 302     print $h <<EOF;
 303 Usage: $0 [OPTIONS]
 304 Extract translatable strings from given HTML::Template input files.
 305
 306 Input file location:
 307   -f, --files-from=FILE          Get list of input files from FILE
 308   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 309
 310 Output file location:
 311   -o, --output=FILE              Write output to specified file
 312
 313 HTML::Template options:
 314   -a, --extract-all              Extract all strings
 315       --pedantic-warnings        Issue warnings even for detected problems
 316                                  which are likely to be harmless
 317
 318 Output details:
 319   -s, --sort-output              generate sorted output
 320   -F, --sort-by-file             sort output by file location
 321   -v, --verbose                  explain what is being done
 322
 323 Informative output:
 324       --help                     Display this help and exit
 325
 326 Try `perldoc $0' for perhaps more information.
 327 EOF
 328     exit($exitcode);
 329 }
 330
 331 ###############################################################################
 332
 333 sub usage_error {
 334     print STDERR "$_[0]\n" if @_;
 335     print STDERR "Try `$0 --help' for more information.\n";
 336     exit(-1);
 337 }
 338
 339 ###############################################################################
 340
 341 Getopt::Long::config qw( bundling no_auto_abbrev );
 342 GetOptions(
 343     'a|extract-all'                     => \$extract_all_p,
 344     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 345     'convert-from=s'                    => \$convert_from,
 346     'D|directory=s'                     => \$directory,
 347     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 348     'f|files-from=s'                    => \$files_from,
 349     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 350     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 351     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 352     'output|o=s'                        => \$output,
 353     'po-mode'                           => \$po_mode_p,         # INTERNAL
 354     's|sort-output'                     => sub { $sort = 's' },
 355     'F|sort-by-file'                    => sub { $sort = 'F' },
 356     'v|verbose'                         => \$verbose_p,
 357     'help'                              => sub { usage(0) },
 358 ) || usage_error;
 359
 360 VerboseWarnings::set_application_name($0);
 361 VerboseWarnings::set_pedantic_mode($pedantic_p);
 362
 363 usage_error('Missing mandatory option -f')
 364         unless defined $files_from || defined $convert_from;
 365 $directory = '.' unless defined $directory;
 366
 367 usage_error('You cannot specify both --convert-from and --files-from')
 368         if defined $convert_from && defined $files_from;
 369
 370 if (defined $output && $output ne '-') {
 371     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 372     open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
 373 } else {
 374     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 375     open($OUTPUT, ">&STDOUT");
 376 }
 377
 378 if (defined $files_from) {
 379     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 380     open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n";
 381     while (<$INPUT>) {
 382         chomp;
 383         my $input = /^\//? $_: "$directory/$_";
 384         my $h = TmplTokenizer->new( $input );
 385         $h->set_allow_cformat( 1 );
 386     VerboseWarnings::set_input_file_name($input);
 387         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 388         text_extract( $h );
 389     }
 390     close $INPUT;
 391 } else {
 392     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 393     convert_translation_file;
 394 }
 395 generate_po_file;
 396
 397 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 398         if TmplTokenizer::syntaxerror_p;
 399
 400
 401 exit(-1) if TmplTokenizer::fatal_p;
 402
 403 ###############################################################################
 404
 405 =head1 DESCRIPTION
 406
 407 This script has behaviour similar to
 408 xgettext(1), and generates gettext-compatible output files.
 409
 410 A gettext-like format provides the following advantages:
 411
 412 =over
 413
 414 =item -
 415
 416 Translation to non-English-like languages with different word
 417 order:  gettext's c-format strings can theoretically be
 418 emulated if we are able to do some analysis on the .tt input
 419 and treat <TMPL_VAR> in a way similar to %s.
 420
 421 =item -
 422
 423 Context for the extracted strings:  the gettext format provides
 424 the filenames and line numbers where each string can be found.
 425 The translator can read the source file and see the context,
 426 in case the string by itself can mean several different things.
 427
 428 =item -
 429
 430 Place for the translator to add comments about the translations.
 431
 432 =item -
 433
 434 Gettext-compatible tools, if any, might be usable if we adopt
 435 the gettext format.
 436
 437 =back
 438
 439 This script has already been in use for over a year and should
 440 be reasonable stable. Nevertheless, it is still somewhat
 441 experimental and there are still some issues.
 442
 443 Please refer to the explanation in tmpl_process3 for further
 444 details.
 445
 446 If you want to generate GNOME-style POTFILES.in files, such
 447 files (passed to -f) can be generated thus:
 448
 449     (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 450         -name \*.inc -o -name \*.tt) > opac/POTFILES.in
 451     (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 452         -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
 453
 454 This is, however, quite pointless, because the "create" and
 455 "update" actions have already been implemented in tmpl_process3.pl.
 456
 457 =head2 Strings inside JavaScript
 458
 459 In the SCRIPT elements, the script will attempt to scan for
 460 _("I<string literal>") patterns, and extract the I<string literal>
 461 as a translatable string.
 462
 463 Note that the C-like _(...) notation is required.
 464
 465 The JavaScript must actually define a _ function
 466 so that the code remains correct JavaScript.
 467 A suitable definition of such a function can be
 468
 469         function _(s) { return s } // dummy function for gettext
 470
 471 =head1 SEE ALSO
 472
 473 tmpl_process3.pl,
 474 xgettext(1),
 475 Locale::PO(3),
 476 translator_doc.txt
 477
 478 =head1 BUGS
 479
 480 There probably are some. Bugs related to scanning of <INPUT>
 481 tags seem to be especially likely to be present.
 482
 483 Its diagnostics are probably too verbose.
 484
 485 When a <TMPL_VAR> within a JavaScript-related attribute is
 486 detected, the script currently displays no warnings at all.
 487 It might be good to display some kind of warning.
 488
 489 Its sort order (-s option) seems to be different than the real
 490 xgettext(1)'s sort option. This will result in translation
 491 strings inside the generated PO file spuriously moving about
 492 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 493
 494 If a Javascript string has leading spaces, it will
 495 generate strings with spurious leading spaces,
 496 leading to failure to match the strings when actually generating
 497 translated files.
 498
 499 =cut