misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tt strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use warnings;
  11 use Getopt::Long;
  12 use POSIX;
  13 use Locale::PO;
  14 use TmplTokenizer;
  15 use VerboseWarnings;
  16
  17 use vars qw( $convert_from );
  18 use vars qw( $files_from $directory $output $sort );
  19 use vars qw( $extract_all_p );
  20 use vars qw( $pedantic_p );
  21 use vars qw( %text %translation );
  22 use vars qw( $charset_in $charset_out );
  23 use vars qw( $disable_fuzzy_p );
  24 use vars qw( $verbose_p );
  25 use vars qw( $po_mode_p );
  26
  27 our $OUTPUT;
  28
  29 ###############################################################################
  30
  31 sub string_negligible_p {
  32     my($t) = @_;                                # a string
  33     # Don't emit pure whitespace, pure numbers, pure punctuation,
  34     # single letters, or TMPL_VAR's.
  35     # Punctuation should arguably be translated. But without context
  36     # they are untranslatable. Note that $t is a string, not a token object.
  37     return !$extract_all_p && (
  38                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  39             || $t =~ /^\d+$/                    # purely digits
  40             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  41             || $t =~ /^[A-Za-z]$/               # single letters
  42             || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ...
  43         || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ )    # pure TT entities
  44         )
  45 }
  46
  47 sub token_negligible_p {
  48     my($x) = @_;
  49     my $t = $x->type;
  50     return !$extract_all_p && (
  51             $t == C4::TmplTokenType::TEXT? string_negligible_p( $x->string ):
  52             $t == C4::TmplTokenType::DIRECTIVE? 1:
  53             $t == C4::TmplTokenType::TEXT_PARAMETRIZED
  54                 && join( '', map { my $t = $_->type;
  55                         $t == C4::TmplTokenType::DIRECTIVE?
  56                                 '1': $t == C4::TmplTokenType::TAG?
  57                                         '': token_negligible_p( $_ )?
  58                                         '': '1' } @{$x->children} ) eq '' );
  59 }
  60
  61 ###############################################################################
  62
  63 sub remember {
  64     my($token, $string) = @_;
  65     # If we determine that the string is negligible, don't bother to remember
  66     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  67         my $key = TmplTokenizer::string_canon( $string );
  68         $text{$key} = [] unless defined $text{$key};
  69         push @{$text{$key}}, $token;
  70     }
  71 }
  72
  73 ###############################################################################
  74
  75 sub string_list {
  76     my @t = keys %text;
  77     # The real gettext tools seems to sort case sensitively; I don't know why
  78     @t = sort { $a cmp $b } @t if $sort eq 's';
  79     @t = sort {
  80             my @aa = sort { $a->pathname cmp $b->pathname
  81                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  82             my @bb = sort { $a->pathname cmp $b->pathname
  83                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  84             $aa[0]->pathname cmp $bb[0]->pathname
  85                     || $aa[0]->line_number <=> $bb[0]->line_number;
  86         } @t if $sort eq 'F';
  87     return @t;
  88 }
  89
  90   ###############################################################################
  91
  92 sub text_extract {
  93     my($h) = @_;
  94     for (;;) {
  95         my $s = TmplTokenizer::next_token $h;
  96         last unless defined $s;
  97         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  98         if ($kind eq C4::TmplTokenType::TEXT) {
  99             if ($t =~ /\S/s && $t !~ /<!/){
 100                 remember( $s, $t );
 101             }
 102         } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
 103             if ($s->form =~ /\S/s && $s->form !~ /<!/){
 104                 remember( $s, $s->form );
 105             }
 106         } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
 107             # value [tag=input], meta
 108             my $tag;
 109             $tag = lc($1) if $t =~ /^<(\S+)/s;
 110             for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
 111                 if ($attr->{$a}) {
 112                     next if $a eq 'label' && $tag ne 'optgroup';
 113                     next if $a eq 'content' && $tag ne 'meta';
 114                     next if $a eq 'value' && ($tag ne 'input'
 115                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 116                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 117                     $val = TmplTokenizer::trim $val;
 118                     remember( $s, $val ) if $val =~ /\S/s;
 119                 }
 120             }
 121         } elsif ($s->has_js_data) {
 122             for my $t (@{$s->js_data}) {
 123               remember( $s, $t->[3] ) if $t->[0]; # FIXME
 124             }
 125         }
 126     }
 127 }
 128
 129 ###############################################################################
 130
 131 sub generate_strings_list {
 132     # Emit all extracted strings.
 133     for my $t (string_list) {
 134         printf $OUTPUT "%s\n", $t;
 135     }
 136 }
 137
 138 ###############################################################################
 139
 140 sub generate_po_file {
 141     # We don't emit the Plural-Forms header; it's meaningless for us
 142     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 143     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 144     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 145     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 146     my $time_pot = $time;
 147     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 148     print $OUTPUT <<EOF;
 149 # SOME DESCRIPTIVE TITLE.
 150 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 151 # This file is distributed under the same license as the PACKAGE package.
 152 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 153 #
 154 EOF
 155     print $OUTPUT <<EOF unless $disable_fuzzy_p;
 156 #, fuzzy
 157 EOF
 158     print $OUTPUT <<EOF;
 159 msgid ""
 160 msgstr ""
 161 "Project-Id-Version: PACKAGE VERSION\\n"
 162 "POT-Creation-Date: $time_pot\\n"
 163 "PO-Revision-Date: $time_po\\n"
 164 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 165 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 166 "MIME-Version: 1.0\\n"
 167 "Content-Type: text/plain; charset=$pot_charset\\n"
 168 "Content-Transfer-Encoding: 8bit\\n"
 169
 170 EOF
 171     my $directory_re = quotemeta("$directory/");
 172     for my $t (string_list) {
 173         if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) {
 174             my($token, $n) = ($text{$t}->[0], 0);
 175         printf $OUTPUT "#. For the first occurrence,\n"
 176                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 177             for my $param ($token->parameters_and_fields) {
 178                 $n += 1;
 179                 my $type = $param->type;
 180                 my $subtype = ($type == C4::TmplTokenType::TAG
 181                         && $param->string =~ /^<input\b/is?
 182                                 $param->attributes->{'type'}->[1]: undef);
 183                 my $fmt = TmplTokenizer::_formalize( $param );
 184                 $fmt =~ s/^%/%$n\$/;
 185                 if ($type == C4::TmplTokenType::DIRECTIVE) {
 186 #                   $type = "Template::Toolkit Directive";
 187                     $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR';
 188                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 189                             $2: undef;
 190             printf $OUTPUT "#. %s: %s\n", $fmt,
 191                         "$type" . (defined $name? " name=$name": '');
 192                 } else {
 193                     my $name = $param->attributes->{'name'};
 194             my $value;
 195             $value = $param->attributes->{'value'}
 196                             unless $subtype =~ /^(?:text)$/;
 197             printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 198                             . (defined $name?  " name=$name->[1]": '')
 199                             . (defined $value? " value=$value->[1]": '');
 200                 }
 201             }
 202         } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) {
 203             my($token) = ($text{$t}->[0]);
 204         printf $OUTPUT "#. For the first occurrence,\n"
 205                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 206             if ($token->string =~ /^<meta\b/is) {
 207                 my $type = $token->attributes->{'http-equiv'}->[1];
 208         print $OUTPUT "#. META http-equiv=$type\n" if defined $type;
 209             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 210                 my $tag = uc($1);
 211                 my $type = (lc($tag) eq 'input'?
 212                         $token->attributes->{'type'}: undef);
 213                 my $name = $token->attributes->{'name'};
 214         printf $OUTPUT "#. %s\n", $tag
 215                     . (defined $type? " type=$type->[1]": '')
 216                     . (defined $name? " name=$name->[1]": '');
 217             }
 218         } elsif ($text{$t}->[0]->has_js_data) {
 219         printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 220         printf $OUTPUT "#. SCRIPT\n";
 221         }
 222         my $cformat_p;
 223         for my $token (@{$text{$t}}) {
 224             my $pathname = $token->pathname;
 225             $pathname =~ s/^$directory_re//os;
 226         $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
 227         printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 228                     if defined $pathname && defined $token->line_number;
 229             $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED;
 230         }
 231         printf $OUTPUT "#, c-format\n" if $cformat_p;
 232         printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 233                 TmplTokenizer::string_canon
 234                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 235         printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 236                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 237     }
 238 }
 239
 240 ###############################################################################
 241
 242 sub convert_translation_file {
 243     open(my $INPUT, '<', $convert_from) || die "$convert_from: $!\n";
 244     VerboseWarnings::set_input_file_name $convert_from;
 245     while (<$INPUT>) {
 246         chomp;
 247         my($msgid, $msgstr) = split(/\t/);
 248         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 249                 unless defined $msgstr;
 250
 251         # Fixup some of the bad strings
 252         $msgid =~ s/^SELECTED>//;
 253
 254         # Create dummy token
 255         my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef );
 256         remember( $token, $msgid );
 257         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 258         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 259
 260         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 261             my $candidate = TmplTokenizer::charset_canon $2;
 262             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 263                     if defined $charset_in && $charset_in ne $candidate;
 264             $charset_in = $candidate;
 265         }
 266         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 267             my $candidate = TmplTokenizer::charset_canon $2;
 268             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 269                     if defined $charset_out && $charset_out ne $candidate;
 270             $charset_out = $candidate;
 271         }
 272     }
 273     # The following assumption is correct; that's what HTML::Template assumes
 274     if (!defined $charset_in) {
 275         $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
 276         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 277     }
 278 }
 279
 280 ###############################################################################
 281
 282 sub usage {
 283     my($exitcode) = @_;
 284     my $h = $exitcode? *STDERR: *STDOUT;
 285     print $h <<EOF;
 286 Usage: $0 [OPTIONS]
 287 Extract translatable strings from given HTML::Template input files.
 288
 289 Input file location:
 290   -f, --files-from=FILE          Get list of input files from FILE
 291   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 292
 293 Output file location:
 294   -o, --output=FILE              Write output to specified file
 295
 296 HTML::Template options:
 297   -a, --extract-all              Extract all strings
 298       --pedantic-warnings        Issue warnings even for detected problems
 299                                  which are likely to be harmless
 300
 301 Output details:
 302   -s, --sort-output              generate sorted output
 303   -F, --sort-by-file             sort output by file location
 304   -v, --verbose                  explain what is being done
 305
 306 Informative output:
 307       --help                     Display this help and exit
 308
 309 Try `perldoc $0' for perhaps more information.
 310 EOF
 311     exit($exitcode);
 312 }
 313
 314 ###############################################################################
 315
 316 sub usage_error {
 317     print STDERR "$_[0]\n" if @_;
 318     print STDERR "Try `$0 --help' for more information.\n";
 319     exit(-1);
 320 }
 321
 322 ###############################################################################
 323
 324 Getopt::Long::config qw( bundling no_auto_abbrev );
 325 GetOptions(
 326     'a|extract-all'                     => \$extract_all_p,
 327     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 328     'convert-from=s'                    => \$convert_from,
 329     'D|directory=s'                     => \$directory,
 330     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 331     'f|files-from=s'                    => \$files_from,
 332     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 333     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 334     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 335     'output|o=s'                        => \$output,
 336     'po-mode'                           => \$po_mode_p,         # INTERNAL
 337     's|sort-output'                     => sub { $sort = 's' },
 338     'F|sort-by-file'                    => sub { $sort = 'F' },
 339     'v|verbose'                         => \$verbose_p,
 340     'help'                              => sub { usage(0) },
 341 ) || usage_error;
 342
 343 VerboseWarnings::set_application_name $0;
 344 VerboseWarnings::set_pedantic_mode $pedantic_p;
 345
 346 usage_error('Missing mandatory option -f')
 347         unless defined $files_from || defined $convert_from;
 348 $directory = '.' unless defined $directory;
 349
 350 usage_error('You cannot specify both --convert-from and --files-from')
 351         if defined $convert_from && defined $files_from;
 352
 353 if (defined $output && $output ne '-') {
 354     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 355         open($OUTPUT, '>', $output) || die "$output: $!\n";
 356 } else {
 357     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 358     open($OUTPUT, ">&STDOUT");
 359 }
 360
 361 if (defined $files_from) {
 362     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 363     open(my $INPUT, '<', $files_from) || die "$files_from: $!\n";
 364     while (<$INPUT>) {
 365         chomp;
 366         my $input = /^\//? $_: "$directory/$_";
 367         my $h = TmplTokenizer->new( $input );
 368         $h->set_allow_cformat( 1 );
 369         VerboseWarnings::set_input_file_name $input;
 370         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 371         text_extract( $h );
 372     }
 373     close $INPUT;
 374 } else {
 375     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 376     convert_translation_file;
 377 }
 378 generate_po_file;
 379
 380 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 381         if TmplTokenizer::syntaxerror_p;
 382
 383
 384 exit(-1) if TmplTokenizer::fatal_p;
 385
 386 ###############################################################################
 387
 388 =head1 DESCRIPTION
 389
 390 This is an experimental script based on the modularized
 391 text-extract2.pl script.  It has behaviour similar to
 392 xgettext(1), and generates gettext-compatible output files.
 393
 394 A gettext-like format provides the following advantages:
 395
 396 =over
 397
 398 =item -
 399
 400 Translation to non-English-like languages with different word
 401 order:  gettext's c-format strings can theoretically be
 402 emulated if we are able to do some analysis on the .tt input
 403 and treat <TMPL_VAR> in a way similar to %s.
 404
 405 =item -
 406
 407 Context for the extracted strings:  the gettext format provides
 408 the filenames and line numbers where each string can be found.
 409 The translator can read the source file and see the context,
 410 in case the string by itself can mean several different things.
 411
 412 =item -
 413
 414 Place for the translator to add comments about the translations.
 415
 416 =item -
 417
 418 Gettext-compatible tools, if any, might be usable if we adopt
 419 the gettext format.
 420
 421 =back
 422
 423 This script has already been in use for over a year and should
 424 be reasonable stable. Nevertheless, it is still somewhat
 425 experimental and there are still some issues.
 426
 427 Please refer to the explanation in tmpl_process3 for further
 428 details.
 429
 430 If you want to generate GNOME-style POTFILES.in files, such
 431 files (passed to -f) can be generated thus:
 432
 433     (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 434         -name \*.inc -o -name \*.tt) > opac/POTFILES.in
 435     (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 436         -name \*.inc -o -name \*.tt) > intranet/POTFILES.in
 437
 438 This is, however, quite pointless, because the "create" and
 439 "update" actions have already been implemented in tmpl_process3.pl.
 440
 441 =head2 Strings inside JavaScript
 442
 443 In the SCRIPT elements, the script will attempt to scan for
 444 _("I<string literal>") patterns, and extract the I<string literal>
 445 as a translatable string.
 446
 447 Note that the C-like _(...) notation is required.
 448
 449 The JavaScript must actually define a _ function
 450 so that the code remains correct JavaScript.
 451 A suitable definition of such a function can be
 452
 453         function _(s) { return s } // dummy function for gettext
 454
 455 =head1 SEE ALSO
 456
 457 tmpl_process3.pl,
 458 xgettext(1),
 459 Locale::PO(3),
 460 translator_doc.txt
 461
 462 =head1 BUGS
 463
 464 There probably are some. Bugs related to scanning of <INPUT>
 465 tags seem to be especially likely to be present.
 466
 467 Its diagnostics are probably too verbose.
 468
 469 When a <TMPL_VAR> within a JavaScript-related attribute is
 470 detected, the script currently displays no warnings at all.
 471 It might be good to display some kind of warning.
 472
 473 Its sort order (-s option) seems to be different than the real
 474 xgettext(1)'s sort option. This will result in translation
 475 strings inside the generated PO file spuriously moving about
 476 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 477
 478 If a Javascript string has leading spaces, it will
 479 generate strings with spurious leading spaces,
 480 leading to failure to match the strings when actually generating
 481 translated files.
 482
 483 =cut