misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use warnings;
  11 use Getopt::Long;
  12 use POSIX;
  13 use Locale::PO;
  14 use TmplTokenizer;
  15 use VerboseWarnings;
  16
  17 use vars qw( $convert_from );
  18 use vars qw( $files_from $directory $output $sort );
  19 use vars qw( $extract_all_p );
  20 use vars qw( $pedantic_p );
  21 use vars qw( %text %translation );
  22 use vars qw( $charset_in $charset_out );
  23 use vars qw( $disable_fuzzy_p );
  24 use vars qw( $verbose_p );
  25 use vars qw( $po_mode_p );
  26
  27 our $OUTPUT;
  28
  29 ###############################################################################
  30
  31 sub string_negligible_p {
  32     my($t) = @_;                                # a string
  33     # Don't emit pure whitespace, pure numbers, pure punctuation,
  34     # single letters, or TMPL_VAR's.
  35     # Punctuation should arguably be translated. But without context
  36     # they are untranslatable. Note that $t is a string, not a token object.
  37     return !$extract_all_p && (
  38                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  39             || $t =~ /^\d+$/                    # purely digits
  40             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  41             || $t =~ /^[A-Za-z]$/               # single letters
  42             || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
  43         || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ )    # pure TT entities
  44         )
  45 }
  46
  47 sub token_negligible_p {
  48     my ($x) = @_;
  49     my $t = $x->type;
  50     return !$extract_all_p && (
  51           $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
  52         : $t == C4::TmplTokenType::DIRECTIVE() ? 1
  53         : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
  54         && join(
  55             '',
  56             map {
  57                 my $t = $_->type;
  58                     $t == C4::TmplTokenType::DIRECTIVE() ? '1'
  59                   : $t == C4::TmplTokenType::TAG()       ? ''
  60                   : token_negligible_p($_)               ? ''
  61                   : '1'
  62             } @{ $x->children }
  63         ) eq ''
  64     );
  65 }
  66
  67 ###############################################################################
  68
  69 sub remember {
  70     my($token, $string) = @_;
  71     # If we determine that the string is negligible, don't bother to remember
  72     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  73         my $key = TmplTokenizer::string_canon( $string );
  74         $text{$key} = [] unless defined $text{$key};
  75         push @{$text{$key}}, $token;
  76     }
  77 }
  78
  79 ###############################################################################
  80
  81 sub string_list {
  82     my @t = keys %text;
  83     # The real gettext tools seems to sort case sensitively; I don't know why
  84     @t = sort { $a cmp $b } @t if $sort eq 's';
  85     @t = sort {
  86             my @aa = sort { $a->pathname cmp $b->pathname
  87                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  88             my @bb = sort { $a->pathname cmp $b->pathname
  89                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  90             $aa[0]->pathname cmp $bb[0]->pathname
  91                     || $aa[0]->line_number <=> $bb[0]->line_number;
  92         } @t if $sort eq 'F';
  93     return @t;
  94 }
  95
  96   ###############################################################################
  97
  98 sub text_extract {
  99     my($h) = @_;
 100     for (;;) {
 101         my $s = TmplTokenizer::next_token $h;
 102         last unless defined $s;
 103         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
 104         if ($kind eq C4::TmplTokenType::TEXT) {
 105             if ($t =~ /\S/s && $t !~ /<!/){
 106                 remember( $s, $t );
 107             }
 108         } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
 109             if ($s->form =~ /\S/s && $s->form !~ /<!/){
 110                 remember( $s, $s->form );
 111             }
 112         } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
 113             # value [tag=input], meta
 114             my $tag;
 115             $tag = lc($1) if $t =~ /^<(\S+)/s;
 116             for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
 117                 if ($attr->{$a}) {
 118                     next if $a eq 'label' && $tag ne 'optgroup';
 119                     next if $a eq 'content' && $tag ne 'meta';
 120                     next if $a eq 'value' && ($tag ne 'input'
 121                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 122                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 123                     $val = TmplTokenizer::trim $val;
 124                     remember( $s, $val ) if $val =~ /\S/s;
 125                 }
 126             }
 127         } elsif ($s->has_js_data) {
 128             for my $t (@{$s->js_data}) {
 129               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 130             }
 131         }
 132     }
 133 }
 134
 135 ###############################################################################
 136
 137 sub generate_strings_list {
 138     # Emit all extracted strings.
 139     for my $t (string_list) {
 140         printf $OUTPUT "%s\n", $t;
 141     }
 142 }
 143
 144 ###############################################################################
 145
 146 sub generate_po_file {
 147     # We don't emit the Plural-Forms header; it's meaningless for us
 148     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 149     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 150     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 151     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 152     my $time_pot = $time;
 153     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 154     print $OUTPUT <<EOF;
 155 # SOME DESCRIPTIVE TITLE.
 156 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 157 # This file is distributed under the same license as the PACKAGE package.
 158 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 159 #
 160 EOF
 161     print $OUTPUT <<EOF unless $disable_fuzzy_p;
 162 #, fuzzy
 163 EOF
 164     print $OUTPUT <<EOF;
 165 msgid ""
 166 msgstr ""
 167 "Project-Id-Version: PACKAGE VERSION\\n"
 168 "POT-Creation-Date: $time_pot\\n"
 169 "PO-Revision-Date: $time_po\\n"
 170 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 171 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 172 "MIME-Version: 1.0\\n"
 173 "Content-Type: text/plain; charset=$pot_charset\\n"
 174 "Content-Transfer-Encoding: 8bit\\n"
 175
 176 EOF
 177     my $directory_re = quotemeta("$directory/");
 178     for my $t (string_list) {
 179         if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
 180             my($token, $n) = ($text{$t}->[0], 0);
 181         printf $OUTPUT "#. For the first occurrence,\n"
 182                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 183             for my $param ($token->parameters_and_fields) {
 184                 $n += 1;
 185                 my $type = $param->type;
 186                 my $subtype = ($type == C4::TmplTokenType::TAG
 187                         && $param->string =~ /^<input\b/is?
 188                                 $param->attributes->{'type'}->[1]: undef);
 189                 my $fmt = TmplTokenizer::_formalize( $param );
 190                 $fmt =~ s/^%/%$n\$/;
 191                 if ($type == C4::TmplTokenType::DIRECTIVE) {
 192 #                   $type = "Template::Toolkit Directive";
 193                     $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
 194                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 195                             $2: undef;
 196             printf $OUTPUT "#. %s: %s\n", $fmt,
 197                         "$type" . (defined $name? " name=$name": '');
 198                 } else {
 199                     my $name = $param->attributes->{'name'};
 200             my $value;
 201             $value = $param->attributes->{'value'}
 202                             unless $subtype =~ /^(?:text)$/;
 203             printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 204                             . (defined $name?  " name=$name->[1]": '')
 205                             . (defined $value? " value=$value->[1]": '');
 206                 }
 207             }
 208         } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
 209             my($token) = ($text{$t}->[0]);
 210         printf $OUTPUT "#. For the first occurrence,\n"
 211                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 212             if ($token->string =~ /^<meta\b/is) {
 213                 my $type = $token->attributes->{'http-equiv'}->[1];
 214         print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
 215             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 216                 my $tag = uc($1);
 217                 my $type = (lc($tag) eq 'input'?
 218                         $token->attributes->{'type'}: undef);
 219                 my $name = $token->attributes->{'name'};
 220         printf $OUTPUT "#. %s\n", $tag
 221                     . (defined $type? " type=$type->[1]": '')
 222                     . (defined $name? " name=$name->[1]": '');
 223             }
 224         } elsif ($text{$t}->[0]->has_js_data) {
 225         printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 226         printf $OUTPUT "#. SCRIPT\n";
 227         }
 228         my $cformat_p;
 229         for my $token (@{$text{$t}}) {
 230             my $pathname = $token->pathname;
 231             $pathname =~ s/^$directory_re//os;
 232         $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 233         printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 234                     if defined $pathname && defined $token->line_number;
 235             $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
 236         }
 237         printf $OUTPUT "#, c-format\n" if $cformat_p;
 238         printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 239                 TmplTokenizer::string_canon
 240                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 241         printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 242                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 243     }
 244 }
 245
 246 ###############################################################################
 247
 248 sub convert_translation_file {
 249     open(my $INPUT, '<', $convert_from) || die "$convert_from: $!\n";
 250     VerboseWarnings::set_input_file_name $convert_from;
 251     while (<$INPUT>) {
 252         chomp;
 253         my($msgid, $msgstr) = split(/\t/);
 254         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 255                 unless defined $msgstr;
 256
 257         # Fixup some of the bad strings
 258         $msgid =~ s/^SELECTED>//;
 259
 260         # Create dummy token
 261         my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
 262         remember( $token, $msgid );
 263         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 264         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 265
 266         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 267             my $candidate = TmplTokenizer::charset_canon $2;
 268             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 269                     if defined $charset_in && $charset_in ne $candidate;
 270             $charset_in = $candidate;
 271         }
 272         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 273             my $candidate = TmplTokenizer::charset_canon $2;
 274             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 275                     if defined $charset_out && $charset_out ne $candidate;
 276             $charset_out = $candidate;
 277         }
 278     }
 279     # The following assumption is correct; that's what HTML::Template assumes
 280     if (!defined $charset_in) {
 281         $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
 282         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 283     }
 284 }
 285
 286 ###############################################################################
 287
 288 sub usage {
 289     my($exitcode) = @_;
 290     my $h = $exitcode? *STDERR: *STDOUT;
 291     print $h <<EOF;
 292 Usage: $0 [OPTIONS]
 293 Extract translatable strings from given HTML::Template input files.
 294
 295 Input file location:
 296   -f, --files-from=FILE          Get list of input files from FILE
 297   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 298
 299 Output file location:
 300   -o, --output=FILE              Write output to specified file
 301
 302 HTML::Template options:
 303   -a, --extract-all              Extract all strings
 304       --pedantic-warnings        Issue warnings even for detected problems
 305                                  which are likely to be harmless
 306
 307 Output details:
 308   -s, --sort-output              generate sorted output
 309   -F, --sort-by-file             sort output by file location
 310   -v, --verbose                  explain what is being done
 311
 312 Informative output:
 313       --help                     Display this help and exit
 314
 315 Try `perldoc $0' for perhaps more information.
 316 EOF
 317     exit($exitcode);
 318 }
 319
 320 ###############################################################################
 321
 322 sub usage_error {
 323     print STDERR "$_[0]\n" if @_;
 324     print STDERR "Try `$0 --help' for more information.\n";
 325     exit(-1);
 326 }
 327
 328 ###############################################################################
 329
 330 Getopt::Long::config qw( bundling no_auto_abbrev );
 331 GetOptions(
 332     'a|extract-all'                     => \$extract_all_p,
 333     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 334     'convert-from=s'                    => \$convert_from,
 335     'D|directory=s'                     => \$directory,
 336     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 337     'f|files-from=s'                    => \$files_from,
 338     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 339     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 340     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 341     'output|o=s'                        => \$output,
 342     'po-mode'                           => \$po_mode_p,         # INTERNAL
 343     's|sort-output'                     => sub { $sort = 's' },
 344     'F|sort-by-file'                    => sub { $sort = 'F' },
 345     'v|verbose'                         => \$verbose_p,
 346     'help'                              => sub { usage(0) },
 347 ) || usage_error;
 348
 349 VerboseWarnings::set_application_name $0;
 350 VerboseWarnings::set_pedantic_mode $pedantic_p;
 351
 352 usage_error('Missing mandatory option -f')
 353         unless defined $files_from || defined $convert_from;
 354 $directory = '.' unless defined $directory;
 355
 356 usage_error('You cannot specify both --convert-from and --files-from')
 357         if defined $convert_from && defined $files_from;
 358
 359 if (defined $output && $output ne '-') {
 360     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 361         open($OUTPUT, '>', $output) || die "$output: $!\n";
 362 } else {
 363     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 364     open($OUTPUT, ">&STDOUT");
 365 }
 366
 367 if (defined $files_from) {
 368     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 369     open(my $INPUT, '<', $files_from) || die "$files_from: $!\n";
 370     while (<$INPUT>) {
 371         chomp;
 372         my $input = /^\//? $_: "$directory/$_";
 373         my $h = TmplTokenizer->new( $input );
 374         $h->set_allow_cformat( 1 );
 375         VerboseWarnings::set_input_file_name $input;
 376         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 377         text_extract( $h );
 378     }
 379     close $INPUT;
 380 } else {
 381     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 382     convert_translation_file;
 383 }
 384 generate_po_file;
 385
 386 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 387         if TmplTokenizer::syntaxerror_p;
 388
 389
 390 exit(-1) if TmplTokenizer::fatal_p;
 391
 392 ###############################################################################
 393
 394 =head1 DESCRIPTION
 395
 396 This is an experimental script based on the modularized
 397 text-extract2.pl script.  It has behaviour similar to
 398 xgettext(1), and generates gettext-compatible output files.
 399
 400 A gettext-like format provides the following advantages:
 401
 402 =over
 403
 404 =item -
 405
 406 Translation to non-English-like languages with different word
 407 order:  gettext's c-format strings can theoretically be
 408 emulated if we are able to do some analysis on the .tt input
 409 and treat <TMPL_VAR> in a way similar to %s.
 410
 411 =item -
 412
 413 Context for the extracted strings:  the gettext format provides
 414 the filenames and line numbers where each string can be found.
 415 The translator can read the source file and see the context,
 416 in case the string by itself can mean several different things.
 417
 418 =item -
 419
 420 Place for the translator to add comments about the translations.
 421
 422 =item -
 423
 424 Gettext-compatible tools, if any, might be usable if we adopt
 425 the gettext format.
 426
 427 =back
 428
 429 This script has already been in use for over a year and should
 430 be reasonable stable. Nevertheless, it is still somewhat
 431 experimental and there are still some issues.
 432
 433 Please refer to the explanation in tmpl_process3 for further
 434 details.
 435
 436 If you want to generate GNOME-style POTFILES.in files, such
 437 files (passed to -f) can be generated thus:
 438
 439     (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 440         -name \*.inc -o -name \*.tt) > opac/POTFILES.in
 441     (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 442         -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
 443
 444 This is, however, quite pointless, because the "create" and
 445 "update" actions have already been implemented in tmpl_process3.pl.
 446
 447 =head2 Strings inside JavaScript
 448
 449 In the SCRIPT elements, the script will attempt to scan for
 450 _("I<string literal>") patterns, and extract the I<string literal>
 451 as a translatable string.
 452
 453 Note that the C-like _(...) notation is required.
 454
 455 The JavaScript must actually define a _ function
 456 so that the code remains correct JavaScript.
 457 A suitable definition of such a function can be
 458
 459         function _(s) { return s } // dummy function for gettext
 460
 461 =head1 SEE ALSO
 462
 463 tmpl_process3.pl,
 464 xgettext(1),
 465 Locale::PO(3),
 466 translator_doc.txt
 467
 468 =head1 BUGS
 469
 470 There probably are some. Bugs related to scanning of <INPUT>
 471 tags seem to be especially likely to be present.
 472
 473 Its diagnostics are probably too verbose.
 474
 475 When a <TMPL_VAR> within a JavaScript-related attribute is
 476 detected, the script currently displays no warnings at all.
 477 It might be good to display some kind of warning.
 478
 479 Its sort order (-s option) seems to be different than the real
 480 xgettext(1)'s sort option. This will result in translation
 481 strings inside the generated PO file spuriously moving about
 482 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 483
 484 If a Javascript string has leading spaces, it will
 485 generate strings with spurious leading spaces,
 486 leading to failure to match the strings when actually generating
 487 translated files.
 488
 489 =cut