misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
   6
   7 =cut
   8
   9 use FindBin;
  10 use lib $FindBin::Bin;
  11
  12 use strict;
  13 use warnings;
  14 use Getopt::Long;
  15 use POSIX;
  16 use Locale::PO;
  17 use TmplTokenizer;
  18 use VerboseWarnings;
  19
  20 use vars qw( $convert_from );
  21 use vars qw( $files_from $directory $output $sort );
  22 use vars qw( $extract_all_p );
  23 use vars qw( $pedantic_p );
  24 use vars qw( %text %translation );
  25 use vars qw( $charset_in $charset_out );
  26 use vars qw( $disable_fuzzy_p );
  27 use vars qw( $verbose_p );
  28 use vars qw( $po_mode_p );
  29
  30 our $OUTPUT;
  31
  32 ###############################################################################
  33
  34 sub string_negligible_p {
  35     my($t) = @_;                                # a string
  36     # Don't emit pure whitespace, pure numbers, pure punctuation,
  37     # single letters, or TMPL_VAR's.
  38     # Punctuation should arguably be translated. But without context
  39     # they are untranslatable. Note that $t is a string, not a token object.
  40     return !$extract_all_p && (
  41                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  42             || $t =~ /^\d+$/                    # purely digits
  43             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  44             || $t =~ /^[A-Za-z]$/               # single letters
  45             || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
  46         || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ )    # pure TT entities
  47         )
  48 }
  49
  50 sub token_negligible_p {
  51     my ($x) = @_;
  52     my $t = $x->type;
  53     return !$extract_all_p && (
  54           $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string )
  55         : $t == C4::TmplTokenType::DIRECTIVE() ? 1
  56         : $t == C4::TmplTokenType::TEXT_PARAMETRIZED()
  57         && join(
  58             '',
  59             map {
  60                 my $t = $_->type;
  61                     $t == C4::TmplTokenType::DIRECTIVE() ? '1'
  62                   : $t == C4::TmplTokenType::TAG()       ? ''
  63                   : token_negligible_p($_)               ? ''
  64                   : '1'
  65             } @{ $x->children }
  66         ) eq ''
  67     );
  68 }
  69
  70 ###############################################################################
  71
  72 sub remember {
  73     my($token, $string) = @_;
  74     # If we determine that the string is negligible, don't bother to remember
  75     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  76         my $key = TmplTokenizer::string_canon( $string );
  77         $text{$key} = [] unless defined $text{$key};
  78         push @{$text{$key}}, $token;
  79     }
  80 }
  81
  82 ###############################################################################
  83
  84 sub string_list {
  85     my @t = keys %text;
  86     # The real gettext tools seems to sort case sensitively; I don't know why
  87     @t = sort { $a cmp $b } @t if $sort eq 's';
  88     @t = sort {
  89             my @aa = sort { $a->pathname cmp $b->pathname
  90                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  91             my @bb = sort { $a->pathname cmp $b->pathname
  92                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  93             $aa[0]->pathname cmp $bb[0]->pathname
  94                     || $aa[0]->line_number <=> $bb[0]->line_number;
  95         } @t if $sort eq 'F';
  96     return @t;
  97 }
  98
  99   ###############################################################################
 100
 101 sub text_extract {
 102     my($h) = @_;
 103     for (;;) {
 104         my $s = TmplTokenizer::next_token $h;
 105         last unless defined $s;
 106         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
 107         if ($kind eq C4::TmplTokenType::TEXT) {
 108             if ($t =~ /\S/s && $t !~ /<!/){
 109                 remember( $s, $t );
 110             }
 111         } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
 112             if ($s->form =~ /\S/s && $s->form !~ /<!/){
 113                 remember( $s, $s->form );
 114             }
 115         } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
 116             # value [tag=input], meta
 117             my $tag;
 118             $tag = lc($1) if $t =~ /^<(\S+)/s;
 119             for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
 120                 if ($attr->{$a}) {
 121                     next if $a eq 'label' && $tag ne 'optgroup';
 122                     next if $a eq 'content' && $tag ne 'meta';
 123                     next if $a eq 'value' && ($tag ne 'input'
 124                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 125                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 126                     $val = TmplTokenizer::trim $val;
 127                     remember( $s, $val ) if $val =~ /\S/s;
 128                 }
 129             }
 130         } elsif ($s->has_js_data) {
 131             for my $t (@{$s->js_data}) {
 132               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 133             }
 134         }
 135     }
 136 }
 137
 138 ###############################################################################
 139
 140 sub generate_strings_list {
 141     # Emit all extracted strings.
 142     for my $t (string_list) {
 143         printf $OUTPUT "%s\n", $t;
 144     }
 145 }
 146
 147 ###############################################################################
 148
 149 sub generate_po_file {
 150     # We don't emit the Plural-Forms header; it's meaningless for us
 151     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 152     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 153     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 154     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 155     my $time_pot = $time;
 156     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 157     print $OUTPUT <<EOF;
 158 # SOME DESCRIPTIVE TITLE.
 159 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 160 # This file is distributed under the same license as the PACKAGE package.
 161 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 162 #
 163 EOF
 164     print $OUTPUT <<EOF unless $disable_fuzzy_p;
 165 #, fuzzy
 166 EOF
 167     print $OUTPUT <<EOF;
 168 msgid ""
 169 msgstr ""
 170 "Project-Id-Version: PACKAGE VERSION\\n"
 171 "POT-Creation-Date: $time_pot\\n"
 172 "PO-Revision-Date: $time_po\\n"
 173 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 174 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 175 "MIME-Version: 1.0\\n"
 176 "Content-Type: text/plain; charset=$pot_charset\\n"
 177 "Content-Transfer-Encoding: 8bit\\n"
 178
 179 EOF
 180     my $directory_re = quotemeta("$directory/");
 181     for my $t (string_list) {
 182         if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
 183             my($token, $n) = ($text{$t}->[0], 0);
 184         printf $OUTPUT "#. For the first occurrence,\n"
 185                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 186             for my $param ($token->parameters_and_fields) {
 187                 $n += 1;
 188                 my $type = $param->type;
 189                 my $subtype = ($type == C4::TmplTokenType::TAG
 190                         && $param->string =~ /^<input\b/is?
 191                                 $param->attributes->{'type'}->[1]: undef);
 192                 my $fmt = TmplTokenizer::_formalize( $param );
 193                 $fmt =~ s/^%/%$n\$/;
 194                 if ($type == C4::TmplTokenType::DIRECTIVE) {
 195 #                   $type = "Template::Toolkit Directive";
 196                     $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
 197                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 198                             $2: undef;
 199             printf $OUTPUT "#. %s: %s\n", $fmt,
 200                         "$type" . (defined $name? " name=$name": '');
 201                 } else {
 202                     my $name = $param->attributes->{'name'};
 203             my $value;
 204             $value = $param->attributes->{'value'}
 205                             unless $subtype =~ /^(?:text)$/;
 206             printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 207                             . (defined $name?  " name=$name->[1]": '')
 208                             . (defined $value? " value=$value->[1]": '');
 209                 }
 210             }
 211         } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
 212             my($token) = ($text{$t}->[0]);
 213         printf $OUTPUT "#. For the first occurrence,\n"
 214                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 215             if ($token->string =~ /^<meta\b/is) {
 216                 my $type = $token->attributes->{'http-equiv'}->[1];
 217         print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
 218             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 219                 my $tag = uc($1);
 220                 my $type = (lc($tag) eq 'input'?
 221                         $token->attributes->{'type'}: undef);
 222                 my $name = $token->attributes->{'name'};
 223         printf $OUTPUT "#. %s\n", $tag
 224                     . (defined $type? " type=$type->[1]": '')
 225                     . (defined $name? " name=$name->[1]": '');
 226             }
 227         } elsif ($text{$t}->[0]->has_js_data) {
 228         printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 229         printf $OUTPUT "#. SCRIPT\n";
 230         }
 231         my $cformat_p;
 232         for my $token (@{$text{$t}}) {
 233             my $pathname = $token->pathname;
 234             $pathname =~ s/^$directory_re//os;
 235         $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 236         printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 237                     if defined $pathname && defined $token->line_number;
 238             $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
 239         }
 240         printf $OUTPUT "#, c-format\n" if $cformat_p;
 241         printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 242                 TmplTokenizer::string_canon
 243                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 244         printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 245                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 246     }
 247 }
 248
 249 ###############################################################################
 250
 251 sub convert_translation_file {
 252     open(my $INPUT, '<', $convert_from) || die "$convert_from: $!\n";
 253     VerboseWarnings::set_input_file_name $convert_from;
 254     while (<$INPUT>) {
 255         chomp;
 256         my($msgid, $msgstr) = split(/\t/);
 257         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 258                 unless defined $msgstr;
 259
 260         # Fixup some of the bad strings
 261         $msgid =~ s/^SELECTED>//;
 262
 263         # Create dummy token
 264         my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
 265         remember( $token, $msgid );
 266         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 267         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 268
 269         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 270             my $candidate = TmplTokenizer::charset_canon $2;
 271             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 272                     if defined $charset_in && $charset_in ne $candidate;
 273             $charset_in = $candidate;
 274         }
 275         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 276             my $candidate = TmplTokenizer::charset_canon $2;
 277             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 278                     if defined $charset_out && $charset_out ne $candidate;
 279             $charset_out = $candidate;
 280         }
 281     }
 282     # The following assumption is correct; that's what HTML::Template assumes
 283     if (!defined $charset_in) {
 284         $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
 285         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 286     }
 287 }
 288
 289 ###############################################################################
 290
 291 sub usage {
 292     my($exitcode) = @_;
 293     my $h = $exitcode? *STDERR: *STDOUT;
 294     print $h <<EOF;
 295 Usage: $0 [OPTIONS]
 296 Extract translatable strings from given HTML::Template input files.
 297
 298 Input file location:
 299   -f, --files-from=FILE          Get list of input files from FILE
 300   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 301
 302 Output file location:
 303   -o, --output=FILE              Write output to specified file
 304
 305 HTML::Template options:
 306   -a, --extract-all              Extract all strings
 307       --pedantic-warnings        Issue warnings even for detected problems
 308                                  which are likely to be harmless
 309
 310 Output details:
 311   -s, --sort-output              generate sorted output
 312   -F, --sort-by-file             sort output by file location
 313   -v, --verbose                  explain what is being done
 314
 315 Informative output:
 316       --help                     Display this help and exit
 317
 318 Try `perldoc $0' for perhaps more information.
 319 EOF
 320     exit($exitcode);
 321 }
 322
 323 ###############################################################################
 324
 325 sub usage_error {
 326     print STDERR "$_[0]\n" if @_;
 327     print STDERR "Try `$0 --help' for more information.\n";
 328     exit(-1);
 329 }
 330
 331 ###############################################################################
 332
 333 Getopt::Long::config qw( bundling no_auto_abbrev );
 334 GetOptions(
 335     'a|extract-all'                     => \$extract_all_p,
 336     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 337     'convert-from=s'                    => \$convert_from,
 338     'D|directory=s'                     => \$directory,
 339     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 340     'f|files-from=s'                    => \$files_from,
 341     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 342     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 343     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 344     'output|o=s'                        => \$output,
 345     'po-mode'                           => \$po_mode_p,         # INTERNAL
 346     's|sort-output'                     => sub { $sort = 's' },
 347     'F|sort-by-file'                    => sub { $sort = 'F' },
 348     'v|verbose'                         => \$verbose_p,
 349     'help'                              => sub { usage(0) },
 350 ) || usage_error;
 351
 352 VerboseWarnings::set_application_name $0;
 353 VerboseWarnings::set_pedantic_mode $pedantic_p;
 354
 355 usage_error('Missing mandatory option -f')
 356         unless defined $files_from || defined $convert_from;
 357 $directory = '.' unless defined $directory;
 358
 359 usage_error('You cannot specify both --convert-from and --files-from')
 360         if defined $convert_from && defined $files_from;
 361
 362 if (defined $output && $output ne '-') {
 363     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 364         open($OUTPUT, '>', $output) || die "$output: $!\n";
 365 } else {
 366     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 367     open($OUTPUT, ">&STDOUT");
 368 }
 369
 370 if (defined $files_from) {
 371     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 372     open(my $INPUT, '<', $files_from) || die "$files_from: $!\n";
 373     while (<$INPUT>) {
 374         chomp;
 375         my $input = /^\//? $_: "$directory/$_";
 376         my $h = TmplTokenizer->new( $input );
 377         $h->set_allow_cformat( 1 );
 378         VerboseWarnings::set_input_file_name $input;
 379         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 380         text_extract( $h );
 381     }
 382     close $INPUT;
 383 } else {
 384     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 385     convert_translation_file;
 386 }
 387 generate_po_file;
 388
 389 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 390         if TmplTokenizer::syntaxerror_p;
 391
 392
 393 exit(-1) if TmplTokenizer::fatal_p;
 394
 395 ###############################################################################
 396
 397 =head1 DESCRIPTION
 398
 399 This script has behaviour similar to
 400 xgettext(1), and generates gettext-compatible output files.
 401
 402 A gettext-like format provides the following advantages:
 403
 404 =over
 405
 406 =item -
 407
 408 Translation to non-English-like languages with different word
 409 order:  gettext's c-format strings can theoretically be
 410 emulated if we are able to do some analysis on the .tt input
 411 and treat <TMPL_VAR> in a way similar to %s.
 412
 413 =item -
 414
 415 Context for the extracted strings:  the gettext format provides
 416 the filenames and line numbers where each string can be found.
 417 The translator can read the source file and see the context,
 418 in case the string by itself can mean several different things.
 419
 420 =item -
 421
 422 Place for the translator to add comments about the translations.
 423
 424 =item -
 425
 426 Gettext-compatible tools, if any, might be usable if we adopt
 427 the gettext format.
 428
 429 =back
 430
 431 This script has already been in use for over a year and should
 432 be reasonable stable. Nevertheless, it is still somewhat
 433 experimental and there are still some issues.
 434
 435 Please refer to the explanation in tmpl_process3 for further
 436 details.
 437
 438 If you want to generate GNOME-style POTFILES.in files, such
 439 files (passed to -f) can be generated thus:
 440
 441     (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 442         -name \*.inc -o -name \*.tt) > opac/POTFILES.in
 443     (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 444         -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
 445
 446 This is, however, quite pointless, because the "create" and
 447 "update" actions have already been implemented in tmpl_process3.pl.
 448
 449 =head2 Strings inside JavaScript
 450
 451 In the SCRIPT elements, the script will attempt to scan for
 452 _("I<string literal>") patterns, and extract the I<string literal>
 453 as a translatable string.
 454
 455 Note that the C-like _(...) notation is required.
 456
 457 The JavaScript must actually define a _ function
 458 so that the code remains correct JavaScript.
 459 A suitable definition of such a function can be
 460
 461         function _(s) { return s } // dummy function for gettext
 462
 463 =head1 SEE ALSO
 464
 465 tmpl_process3.pl,
 466 xgettext(1),
 467 Locale::PO(3),
 468 translator_doc.txt
 469
 470 =head1 BUGS
 471
 472 There probably are some. Bugs related to scanning of <INPUT>
 473 tags seem to be especially likely to be present.
 474
 475 Its diagnostics are probably too verbose.
 476
 477 When a <TMPL_VAR> within a JavaScript-related attribute is
 478 detected, the script currently displays no warnings at all.
 479 It might be good to display some kind of warning.
 480
 481 Its sort order (-s option) seems to be different than the real
 482 xgettext(1)'s sort option. This will result in translation
 483 strings inside the generated PO file spuriously moving about
 484 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 485
 486 If a Javascript string has leading spaces, it will
 487 generate strings with spurious leading spaces,
 488 leading to failure to match the strings when actually generating
 489 translated files.
 490
 491 =cut