misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
   6
   7 =cut
   8
   9 use FindBin;
  10 use lib $FindBin::Bin;
  11
  12 use strict;
  13 use warnings;
  14 use Getopt::Long;
  15 use POSIX;
  16 use Locale::PO;
  17 use TmplTokenizer;
  18 use VerboseWarnings;
  19
  20 use vars qw( $convert_from );
  21 use vars qw( $files_from $directory $output $sort );
  22 use vars qw( $extract_all_p );
  23 use vars qw( $pedantic_p );
  24 use vars qw( %text %translation );
  25 use vars qw( $charset_in $charset_out );
  26 use vars qw( $disable_fuzzy_p );
  27 use vars qw( $verbose_p );
  28 use vars qw( $po_mode_p );
  29
  30 our $OUTPUT;
  31
  32 ###############################################################################
  33
  34 sub string_negligible_p {
  35     my($t) = @_;                                # a string
  36     # Don't emit pure whitespace, pure numbers, pure punctuation,
  37     # single letters, or TMPL_VAR's.
  38     # Punctuation should arguably be translated. But without context
  39     # they are untranslatable. Note that $t is a string, not a token object.
  40     return !$extract_all_p && (
  41                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  42             || $t =~ /^\d+$/                    # purely digits
  43             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  44             || $t =~ /^[A-Za-z]$/               # single letters
  45             || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
  46         || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ )    # pure TT entities
  47         || $t =~ /^\s*<\?.*\?>/                               # ignore xml prolog
  48         )
  49 }
  50
  51 sub token_negligible_p {
  52     my ($x) = @_;
  53     my $t = $x->type;
  54     return !$extract_all_p && (
  55           $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
  56         : $t == C4::TmplTokenType::DIRECTIVE() ? 1
  57         : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
  58         && join(
  59             '',
  60             map {
  61                 my $t = $_->type;
  62                     $t == C4::TmplTokenType::DIRECTIVE() ? '1'
  63                   : $t == C4::TmplTokenType::TAG()       ? ''
  64                   : token_negligible_p($_)               ? ''
  65                   : '1'
  66             } @{ $x->children }
  67         ) eq ''
  68     );
  69 }
  70
  71 ###############################################################################
  72
  73 sub remember {
  74     my($token, $string) = @_;
  75     # If we determine that the string is negligible, don't bother to remember
  76     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  77         my $key = TmplTokenizer::string_canon( $string );
  78         $text{$key} = [] unless defined $text{$key};
  79         push @{$text{$key}}, $token;
  80     }
  81 }
  82
  83 ###############################################################################
  84
  85 sub string_list {
  86     my @t = keys %text;
  87     # The real gettext tools seems to sort case sensitively; I don't know why
  88     @t = sort { $a cmp $b } @t if $sort eq 's';
  89     @t = sort {
  90             my @aa = sort { $a->pathname cmp $b->pathname
  91                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  92             my @bb = sort { $a->pathname cmp $b->pathname
  93                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  94             $aa[0]->pathname cmp $bb[0]->pathname
  95                     || $aa[0]->line_number <=> $bb[0]->line_number;
  96         } @t if $sort eq 'F';
  97     return @t;
  98 }
  99
 100   ###############################################################################
 101
 102 sub text_extract {
 103     my($h) = @_;
 104     for (;;) {
 105         my $s = TmplTokenizer::next_token $h;
 106         last unless defined $s;
 107         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
 108         if ($kind eq C4::TmplTokenType::TEXT) {
 109             if ($t =~ /\S/s && $t !~ /<!/){
 110                 remember( $s, $t );
 111             }
 112         } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
 113             if ($s->form =~ /\S/s && $s->form !~ /<!/){
 114                 remember( $s, $s->form );
 115             }
 116         } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
 117             # value [tag=input], meta
 118             my $tag;
 119             $tag = lc($1) if $t =~ /^<(\S+)/s;
 120             for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
 121                 if ($attr->{$a}) {
 122                     next if $a eq 'label' && $tag ne 'optgroup';
 123                     next if $a eq 'content' && $tag ne 'meta';
 124                     next if $a eq 'value' && ($tag ne 'input'
 125                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 126                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 127                     $val = TmplTokenizer::trim $val;
 128                     # for selected attributes replace '[%..%]' with '%s' globally
 129                     if ( $a =~ /title|value|alt|content|placeholder/ ) {
 130                         $val =~ s/\[\%.*?\%\]/\%s/g;
 131                     }
 132                     # save attribute text for translation
 133                     remember( $s, $val ) if $val =~ /\S/s;
 134                 }
 135             }
 136         } elsif ($s->has_js_data) {
 137             for my $t (@{$s->js_data}) {
 138               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 139             }
 140         }
 141     }
 142 }
 143
 144 ###############################################################################
 145
 146 sub generate_strings_list {
 147     # Emit all extracted strings.
 148     for my $t (string_list) {
 149         printf $OUTPUT "%s\n", $t;
 150     }
 151 }
 152
 153 ###############################################################################
 154
 155 sub generate_po_file {
 156     # We don't emit the Plural-Forms header; it's meaningless for us
 157     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 158     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 159     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 160     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 161     my $time_pot = $time;
 162     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 163     print $OUTPUT <<EOF;
 164 # SOME DESCRIPTIVE TITLE.
 165 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 166 # This file is distributed under the same license as the PACKAGE package.
 167 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 168 #
 169 EOF
 170     print $OUTPUT <<EOF unless $disable_fuzzy_p;
 171 #, fuzzy
 172 EOF
 173     print $OUTPUT <<EOF;
 174 msgid ""
 175 msgstr ""
 176 "Project-Id-Version: PACKAGE VERSION\\n"
 177 "POT-Creation-Date: $time_pot\\n"
 178 "PO-Revision-Date: $time_po\\n"
 179 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 180 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 181 "MIME-Version: 1.0\\n"
 182 "Content-Type: text/plain; charset=$pot_charset\\n"
 183 "Content-Transfer-Encoding: 8bit\\n"
 184
 185 EOF
 186     my $directory_re = quotemeta("$directory/");
 187     for my $t (string_list) {
 188         if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
 189             my($token, $n) = ($text{$t}->[0], 0);
 190         printf $OUTPUT "#. For the first occurrence,\n"
 191                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 192             for my $param ($token->parameters_and_fields) {
 193                 $n += 1;
 194                 my $type = $param->type;
 195                 my $subtype = ($type == C4::TmplTokenType::TAG
 196                         && $param->string =~ /^<input\b/is?
 197                                 $param->attributes->{'type'}->[1]: undef);
 198                 my $fmt = TmplTokenizer::_formalize( $param );
 199                 $fmt =~ s/^%/%$n\$/;
 200                 if ($type == C4::TmplTokenType::DIRECTIVE) {
 201 #                   $type = "Template::Toolkit Directive";
 202                     $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
 203                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 204                             $2: undef;
 205             printf $OUTPUT "#. %s: %s\n", $fmt,
 206                         "$type" . (defined $name? " name=$name": '');
 207                 } else {
 208                     my $name = $param->attributes->{'name'};
 209             my $value;
 210             $value = $param->attributes->{'value'}
 211                             unless $subtype =~ /^(?:text)$/;
 212             printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 213                             . (defined $name?  " name=$name->[1]": '')
 214                             . (defined $value? " value=$value->[1]": '');
 215                 }
 216             }
 217         } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
 218             my($token) = ($text{$t}->[0]);
 219         printf $OUTPUT "#. For the first occurrence,\n"
 220                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 221             if ($token->string =~ /^<meta\b/is) {
 222                 my $type = $token->attributes->{'http-equiv'}->[1];
 223         print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
 224             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 225                 my $tag = uc($1);
 226                 my $type = (lc($tag) eq 'input'?
 227                         $token->attributes->{'type'}: undef);
 228                 my $name = $token->attributes->{'name'};
 229         printf $OUTPUT "#. %s\n", $tag
 230                     . (defined $type? " type=$type->[1]": '')
 231                     . (defined $name? " name=$name->[1]": '');
 232             }
 233         } elsif ($text{$t}->[0]->has_js_data) {
 234         printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 235         printf $OUTPUT "#. SCRIPT\n";
 236         }
 237         my $cformat_p;
 238         for my $token (@{$text{$t}}) {
 239             my $pathname = $token->pathname;
 240             $pathname =~ s/^$directory_re//os;
 241         $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 242         printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 243                     if defined $pathname && defined $token->line_number;
 244             $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
 245         }
 246         printf $OUTPUT "#, c-format\n" if $cformat_p;
 247         printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 248                 TmplTokenizer::string_canon
 249                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 250         printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 251                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 252     }
 253 }
 254
 255 ###############################################################################
 256
 257 sub convert_translation_file {
 258     open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n";
 259     VerboseWarnings::set_input_file_name $convert_from;
 260     while (<$INPUT>) {
 261         chomp;
 262         my($msgid, $msgstr) = split(/\t/);
 263         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 264                 unless defined $msgstr;
 265
 266         # Fixup some of the bad strings
 267         $msgid =~ s/^SELECTED>//;
 268
 269         # Create dummy token
 270         my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
 271         remember( $token, $msgid );
 272         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 273         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 274
 275         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 276             my $candidate = TmplTokenizer::charset_canon $2;
 277             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 278                     if defined $charset_in && $charset_in ne $candidate;
 279             $charset_in = $candidate;
 280         }
 281         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 282             my $candidate = TmplTokenizer::charset_canon $2;
 283             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 284                     if defined $charset_out && $charset_out ne $candidate;
 285             $charset_out = $candidate;
 286         }
 287     }
 288     # The following assumption is correct; that's what HTML::Template assumes
 289     if (!defined $charset_in) {
 290         $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
 291         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 292     }
 293 }
 294
 295 ###############################################################################
 296
 297 sub usage {
 298     my($exitcode) = @_;
 299     my $h = $exitcode? *STDERR: *STDOUT;
 300     print $h <<EOF;
 301 Usage: $0 [OPTIONS]
 302 Extract translatable strings from given HTML::Template input files.
 303
 304 Input file location:
 305   -f, --files-from=FILE          Get list of input files from FILE
 306   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 307
 308 Output file location:
 309   -o, --output=FILE              Write output to specified file
 310
 311 HTML::Template options:
 312   -a, --extract-all              Extract all strings
 313       --pedantic-warnings        Issue warnings even for detected problems
 314                                  which are likely to be harmless
 315
 316 Output details:
 317   -s, --sort-output              generate sorted output
 318   -F, --sort-by-file             sort output by file location
 319   -v, --verbose                  explain what is being done
 320
 321 Informative output:
 322       --help                     Display this help and exit
 323
 324 Try `perldoc $0' for perhaps more information.
 325 EOF
 326     exit($exitcode);
 327 }
 328
 329 ###############################################################################
 330
 331 sub usage_error {
 332     print STDERR "$_[0]\n" if @_;
 333     print STDERR "Try `$0 --help' for more information.\n";
 334     exit(-1);
 335 }
 336
 337 ###############################################################################
 338
 339 Getopt::Long::config qw( bundling no_auto_abbrev );
 340 GetOptions(
 341     'a|extract-all'                     => \$extract_all_p,
 342     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 343     'convert-from=s'                    => \$convert_from,
 344     'D|directory=s'                     => \$directory,
 345     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 346     'f|files-from=s'                    => \$files_from,
 347     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 348     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 349     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 350     'output|o=s'                        => \$output,
 351     'po-mode'                           => \$po_mode_p,         # INTERNAL
 352     's|sort-output'                     => sub { $sort = 's' },
 353     'F|sort-by-file'                    => sub { $sort = 'F' },
 354     'v|verbose'                         => \$verbose_p,
 355     'help'                              => sub { usage(0) },
 356 ) || usage_error;
 357
 358 VerboseWarnings::set_application_name $0;
 359 VerboseWarnings::set_pedantic_mode $pedantic_p;
 360
 361 usage_error('Missing mandatory option -f')
 362         unless defined $files_from || defined $convert_from;
 363 $directory = '.' unless defined $directory;
 364
 365 usage_error('You cannot specify both --convert-from and --files-from')
 366         if defined $convert_from && defined $files_from;
 367
 368 if (defined $output && $output ne '-') {
 369     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 370     open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n";
 371 } else {
 372     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 373     open($OUTPUT, ">&STDOUT");
 374 }
 375
 376 if (defined $files_from) {
 377     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 378     open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n";
 379     while (<$INPUT>) {
 380         chomp;
 381         my $input = /^\//? $_: "$directory/$_";
 382         my $h = TmplTokenizer->new( $input );
 383         $h->set_allow_cformat( 1 );
 384         VerboseWarnings::set_input_file_name $input;
 385         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 386         text_extract( $h );
 387     }
 388     close $INPUT;
 389 } else {
 390     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 391     convert_translation_file;
 392 }
 393 generate_po_file;
 394
 395 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 396         if TmplTokenizer::syntaxerror_p;
 397
 398
 399 exit(-1) if TmplTokenizer::fatal_p;
 400
 401 ###############################################################################
 402
 403 =head1 DESCRIPTION
 404
 405 This script has behaviour similar to
 406 xgettext(1), and generates gettext-compatible output files.
 407
 408 A gettext-like format provides the following advantages:
 409
 410 =over
 411
 412 =item -
 413
 414 Translation to non-English-like languages with different word
 415 order:  gettext's c-format strings can theoretically be
 416 emulated if we are able to do some analysis on the .tt input
 417 and treat <TMPL_VAR> in a way similar to %s.
 418
 419 =item -
 420
 421 Context for the extracted strings:  the gettext format provides
 422 the filenames and line numbers where each string can be found.
 423 The translator can read the source file and see the context,
 424 in case the string by itself can mean several different things.
 425
 426 =item -
 427
 428 Place for the translator to add comments about the translations.
 429
 430 =item -
 431
 432 Gettext-compatible tools, if any, might be usable if we adopt
 433 the gettext format.
 434
 435 =back
 436
 437 This script has already been in use for over a year and should
 438 be reasonable stable. Nevertheless, it is still somewhat
 439 experimental and there are still some issues.
 440
 441 Please refer to the explanation in tmpl_process3 for further
 442 details.
 443
 444 If you want to generate GNOME-style POTFILES.in files, such
 445 files (passed to -f) can be generated thus:
 446
 447     (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 448         -name \*.inc -o -name \*.tt) > opac/POTFILES.in
 449     (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 450         -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
 451
 452 This is, however, quite pointless, because the "create" and
 453 "update" actions have already been implemented in tmpl_process3.pl.
 454
 455 =head2 Strings inside JavaScript
 456
 457 In the SCRIPT elements, the script will attempt to scan for
 458 _("I<string literal>") patterns, and extract the I<string literal>
 459 as a translatable string.
 460
 461 Note that the C-like _(...) notation is required.
 462
 463 The JavaScript must actually define a _ function
 464 so that the code remains correct JavaScript.
 465 A suitable definition of such a function can be
 466
 467         function _(s) { return s } // dummy function for gettext
 468
 469 =head1 SEE ALSO
 470
 471 tmpl_process3.pl,
 472 xgettext(1),
 473 Locale::PO(3),
 474 translator_doc.txt
 475
 476 =head1 BUGS
 477
 478 There probably are some. Bugs related to scanning of <INPUT>
 479 tags seem to be especially likely to be present.
 480
 481 Its diagnostics are probably too verbose.
 482
 483 When a <TMPL_VAR> within a JavaScript-related attribute is
 484 detected, the script currently displays no warnings at all.
 485 It might be good to display some kind of warning.
 486
 487 Its sort order (-s option) seems to be different than the real
 488 xgettext(1)'s sort option. This will result in translation
 489 strings inside the generated PO file spuriously moving about
 490 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 491
 492 If a Javascript string has leading spaces, it will
 493 generate strings with spurious leading spaces,
 494 leading to failure to match the strings when actually generating
 495 translated files.
 496
 497 =cut