misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use Getopt::Long;
  11 use POSIX;
  12 use Locale::PO;
  13 use TmplTokenizer;
  14 use VerboseWarnings;
  15
  16 use vars qw( $convert_from );
  17 use vars qw( $files_from $directory $output $sort );
  18 use vars qw( $extract_all_p );
  19 use vars qw( $pedantic_p );
  20 use vars qw( %text %translation );
  21 use vars qw( $charset_in $charset_out );
  22 use vars qw( $disable_fuzzy_p );
  23 use vars qw( $verbose_p );
  24 use vars qw( $po_mode_p );
  25
  26 ###############################################################################
  27
  28 sub string_negligible_p ($) {
  29     my($t) = @_;                                # a string
  30     # Don't emit pure whitespace, pure numbers, pure punctuation,
  31     # single letters, or TMPL_VAR's.
  32     # Punctuation should arguably be translated. But without context
  33     # they are untranslatable. Note that $t is a string, not a token object.
  34     return !$extract_all_p && (
  35                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  36             || $t =~ /^\d+$/                    # purely digits
  37             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  38             || $t =~ /^[A-Za-z]$/               # single letters
  39         )
  40 }
  41
  42 sub token_negligible_p( $ ) {
  43     my($x) = @_;
  44     my $t = $x->type;
  45     return !$extract_all_p && (
  46             $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
  47             $t == TmplTokenType::DIRECTIVE? 1:
  48             $t == TmplTokenType::TEXT_PARAMETRIZED
  49                 && join( '', map { my $t = $_->type;
  50                         $t == TmplTokenType::DIRECTIVE?
  51                                 '1': $t == TmplTokenType::TAG?
  52                                         '': token_negligible_p( $_ )?
  53                                         '': '1' } @{$x->children} ) eq '' );
  54 }
  55
  56 ###############################################################################
  57
  58 sub remember ($$) {
  59     my($token, $string) = @_;
  60     # If we determine that the string is negligible, don't bother to remember
  61     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  62         my $key = TmplTokenizer::string_canon( $string );
  63         $text{$key} = [] unless defined $text{$key};
  64         push @{$text{$key}}, $token;
  65     }
  66 }
  67
  68 ###############################################################################
  69
  70 sub string_list () {
  71     my @t = keys %text;
  72     # The real gettext tools seems to sort case sensitively; I don't know why
  73     @t = sort { $a cmp $b } @t if $sort eq 's';
  74     @t = sort {
  75             my @aa = sort { $a->pathname cmp $b->pathname
  76                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  77             my @bb = sort { $a->pathname cmp $b->pathname
  78                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  79             $aa[0]->pathname cmp $bb[0]->pathname
  80                     || $aa[0]->line_number <=> $bb[0]->line_number;
  81         } @t if $sort eq 'F';
  82     return @t;
  83 }
  84
  85 ###############################################################################
  86
  87 sub text_extract (*) {
  88     my($h) = @_;
  89     for (;;) {
  90         my $s = TmplTokenizer::next_token $h;
  91     last unless defined $s;
  92         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  93         if ($kind eq TmplTokenType::TEXT) {
  94             remember( $s, $t ) if $t =~ /\S/s;
  95         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  96             remember( $s, $s->form ) if $s->form =~ /\S/s;
  97         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
  98             # value [tag=input], meta
  99             my $tag = lc($1) if $t =~ /^<(\S+)/s;
 100             for my $a ('alt', 'content', 'title', 'value') {
 101                 if ($attr->{$a}) {
 102                     next if $a eq 'content' && $tag ne 'meta';
 103                     next if $a eq 'value' && ($tag ne 'input'
 104                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
 105                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 106                     $val = TmplTokenizer::trim $val;
 107                     remember( $s, $val ) if $val =~ /\S/s;
 108                 }
 109             }
 110         } elsif ($s->has_js_data) {
 111             for my $t (@{$s->js_data}) {
 112                 remember( $s, $t->[3] ) if $t->[0]; # FIXME
 113             }
 114         }
 115     }
 116 }
 117
 118 ###############################################################################
 119
 120 sub generate_strings_list () {
 121     # Emit all extracted strings.
 122     for my $t (string_list) {
 123         printf OUTPUT "%s\n", $t;
 124     }
 125 }
 126
 127 ###############################################################################
 128
 129 sub generate_po_file () {
 130     # We don't emit the Plural-Forms header; it's meaningless for us
 131     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 132     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 133     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 134     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 135     my $time_pot = $time;
 136     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 137     print OUTPUT <<EOF;
 138 # SOME DESCRIPTIVE TITLE.
 139 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 140 # This file is distributed under the same license as the PACKAGE package.
 141 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 142 #
 143 EOF
 144     print OUTPUT <<EOF unless $disable_fuzzy_p;
 145 #, fuzzy
 146 EOF
 147     print OUTPUT <<EOF;
 148 msgid ""
 149 msgstr ""
 150 "Project-Id-Version: PACKAGE VERSION\\n"
 151 "POT-Creation-Date: $time_pot\\n"
 152 "PO-Revision-Date: $time_po\\n"
 153 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 154 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 155 "MIME-Version: 1.0\\n"
 156 "Content-Type: text/plain; charset=$pot_charset\\n"
 157 "Content-Transfer-Encoding: 8bit\\n"
 158
 159 EOF
 160     my $directory_re = quotemeta("$directory/");
 161     for my $t (string_list) {
 162         if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
 163             my($token, $n) = ($text{$t}->[0], 0);
 164             printf OUTPUT "#. For the first occurrence,\n"
 165                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 166             for my $param ($token->parameters_and_fields) {
 167                 $n += 1;
 168                 my $type = $param->type;
 169                 my $subtype = ($type == TmplTokenType::TAG
 170                         && $param->string =~ /^<input\b/is?
 171                                 $param->attributes->{'type'}->[1]: undef);
 172                 my $fmt = TmplTokenizer::_formalize( $param );
 173                 $fmt =~ s/^%/%$n\$/;
 174                 if ($type == TmplTokenType::DIRECTIVE) {
 175                     $type = $param->string =~ /(TMPL_[A-Z]+)+/is? $1: 'ERROR';
 176                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 177                             $2: undef;
 178                     printf OUTPUT "#. %s: %s\n", $fmt,
 179                         "$type" . (defined $name? " name=$name": '');
 180                 } else {
 181                     my $name = $param->attributes->{'name'};
 182                     my $value = $param->attributes->{'value'}
 183                             unless $subtype =~ /^(?:text)$/;
 184                     printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 185                             . (defined $name?  " name=$name->[1]": '')
 186                             . (defined $value? " value=$value->[1]": '');
 187                 }
 188             }
 189         } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
 190             my($token) = ($text{$t}->[0]);
 191             printf OUTPUT "#. For the first occurrence,\n"
 192                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 193             if ($token->string =~ /^<meta\b/is) {
 194                 my $type = $token->attributes->{'http-equiv'}->[1];
 195                 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
 196             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 197                 my $tag = uc($1);
 198                 my $type = (lc($tag) eq 'input'?
 199                         $token->attributes->{'type'}: undef);
 200                 my $name = $token->attributes->{'name'};
 201                 printf OUTPUT "#. %s\n", $tag
 202                     . (defined $type? " type=$type->[1]": '')
 203                     . (defined $name? " name=$name->[1]": '');
 204             }
 205         } elsif ($text{$t}->[0]->has_js_data) {
 206             printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 207             printf OUTPUT "#. SCRIPT\n";
 208         }
 209         my $cformat_p;
 210         for my $token (@{$text{$t}}) {
 211             my $pathname = $token->pathname;
 212             $pathname =~ s/^$directory_re//os;
 213             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 214                     if defined $pathname && defined $token->line_number;
 215             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 216         }
 217         printf OUTPUT "#, c-format\n" if $cformat_p;
 218         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 219                 TmplTokenizer::string_canon
 220                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 221         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 222                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 223     }
 224 }
 225
 226 ###############################################################################
 227
 228 sub convert_translation_file () {
 229     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 230     VerboseWarnings::set_input_file_name $convert_from;
 231     while (<INPUT>) {
 232         chomp;
 233         my($msgid, $msgstr) = split(/\t/);
 234         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 235                 unless defined $msgstr;
 236
 237         # Fixup some of the bad strings
 238         $msgid =~ s/^SELECTED>//;
 239
 240         # Create dummy token
 241         my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
 242         remember( $token, $msgid );
 243         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 244         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 245
 246         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 247             my $candidate = TmplTokenizer::charset_canon $2;
 248             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 249                     if defined $charset_in && $charset_in ne $candidate;
 250             $charset_in = $candidate;
 251         }
 252         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 253             my $candidate = TmplTokenizer::charset_canon $2;
 254             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 255                     if defined $charset_out && $charset_out ne $candidate;
 256             $charset_out = $candidate;
 257         }
 258     }
 259     # The following assumption is correct; that's what HTML::Template assumes
 260     if (!defined $charset_in) {
 261         $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
 262         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 263     }
 264 }
 265
 266 ###############################################################################
 267
 268 sub usage ($) {
 269     my($exitcode) = @_;
 270     my $h = $exitcode? *STDERR: *STDOUT;
 271     print $h <<EOF;
 272 Usage: $0 [OPTIONS]
 273 Extract translatable strings from given HTML::Template input files.
 274
 275 Input file location:
 276   -f, --files-from=FILE          Get list of input files from FILE
 277   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 278
 279 Output file location:
 280   -o, --output=FILE              Write output to specified file
 281
 282 HTML::Template options:
 283   -a, --extract-all              Extract all strings
 284       --pedantic-warnings        Issue warnings even for detected problems
 285                                  which are likely to be harmless
 286
 287 Output details:
 288   -s, --sort-output              generate sorted output
 289   -F, --sort-by-file             sort output by file location
 290   -v, --verbose                  explain what is being done
 291
 292 Informative output:
 293       --help                     Display this help and exit
 294
 295 Try `perldoc $0' for perhaps more information.
 296 EOF
 297     exit($exitcode);
 298 }
 299
 300 ###############################################################################
 301
 302 sub usage_error (;$) {
 303     print STDERR "$_[0]\n" if @_;
 304     print STDERR "Try `$0 --help' for more information.\n";
 305     exit(-1);
 306 }
 307
 308 ###############################################################################
 309
 310 Getopt::Long::config qw( bundling no_auto_abbrev );
 311 GetOptions(
 312     'a|extract-all'                     => \$extract_all_p,
 313     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 314     'convert-from=s'                    => \$convert_from,
 315     'D|directory=s'                     => \$directory,
 316     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 317     'f|files-from=s'                    => \$files_from,
 318     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 319     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 320     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 321     'output|o=s'                        => \$output,
 322     'po-mode'                           => \$po_mode_p,         # INTERNAL
 323     's|sort-output'                     => sub { $sort = 's' },
 324     'F|sort-by-file'                    => sub { $sort = 'F' },
 325     'v|verbose'                         => \$verbose_p,
 326     'help'                              => sub { usage(0) },
 327 ) || usage_error;
 328
 329 VerboseWarnings::set_application_name $0;
 330 VerboseWarnings::set_pedantic_mode $pedantic_p;
 331
 332 usage_error('Missing mandatory option -f')
 333         unless defined $files_from || defined $convert_from;
 334 $directory = '.' unless defined $directory;
 335
 336 usage_error('You cannot specify both --convert-from and --files-from')
 337         if defined $convert_from && defined $files_from;
 338
 339 if (defined $output && $output ne '-') {
 340     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 341     open(OUTPUT, ">$output") || die "$output: $!\n";
 342 } else {
 343     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 344     open(OUTPUT, ">&STDOUT");
 345 }
 346
 347 if (defined $files_from) {
 348     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 349     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 350     while (<INPUT>) {
 351         chomp;
 352         my $input = /^\//? $_: "$directory/$_";
 353         my $h = TmplTokenizer->new( $input );
 354         $h->set_allow_cformat( 1 );
 355         VerboseWarnings::set_input_file_name $input;
 356         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 357         text_extract( $h );
 358     }
 359     close INPUT;
 360 } else {
 361     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 362     convert_translation_file;
 363 }
 364 generate_po_file;
 365
 366 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 367         if TmplTokenizer::syntaxerror_p;
 368
 369
 370 exit(-1) if TmplTokenizer::fatal_p;
 371
 372 ###############################################################################
 373
 374 =head1 DESCRIPTION
 375
 376 This is an experimental script based on the modularized
 377 text-extract2.pl script.  It has behaviour similar to
 378 xgettext(1), and generates gettext-compatible output files.
 379
 380 A gettext-like format provides the following advantages:
 381
 382 =over
 383
 384 =item -
 385
 386 Translation to non-English-like languages with different word
 387 order:  gettext's c-format strings can theoretically be
 388 emulated if we are able to do some analysis on the .tmpl input
 389 and treat <TMPL_VAR> in a way similar to %s.
 390
 391 =item -
 392
 393 Context for the extracted strings:  the gettext format provides
 394 the filenames and line numbers where each string can be found.
 395 The translator can read the source file and see the context,
 396 in case the string by itself can mean several different things.
 397
 398 =item -
 399
 400 Place for the translator to add comments about the translations.
 401
 402 =item -
 403
 404 Gettext-compatible tools, if any, might be usable if we adopt
 405 the gettext format.
 406
 407 =back
 408
 409 This script has already been in use for over a year and should
 410 be reasonable stable. Nevertheless, it is still somewhat
 411 experimental and there are still some issues.
 412
 413 Please refer to the explanation in tmpl_process3 for further
 414 details.
 415
 416 If you want to generate GNOME-style POTFILES.in files, such
 417 files (passed to -f) can be generated thus:
 418
 419         (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 420                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 421         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 422                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 423
 424 This is, however, quite pointless, because the "create" and
 425 "update" actions have already been implemented in tmpl_process3.pl.
 426
 427 =head2 Strings inside JavaScript
 428
 429 In the SCRIPT elements, the script will attempt to scan for
 430 _("I<string literal>") patterns, and extract the I<string literal>
 431 as a translatable string.
 432
 433 Note that the C-like _(...) notation is required.
 434
 435 The JavaScript must actually define a _ function
 436 so that the code remains correct JavaScript.
 437 A suitable definition of such a function can be
 438
 439         function _(s) { return s } // dummy function for gettext
 440
 441 =head1 SEE ALSO
 442
 443 tmpl_process3.pl,
 444 xgettext(1),
 445 Locale::PO(3),
 446 translator_doc.txt
 447
 448 =head1 BUGS
 449
 450 There probably are some. Bugs related to scanning of <INPUT>
 451 tags seem to be especially likely to be present.
 452
 453 Its diagnostics are probably too verbose.
 454
 455 When a <TMPL_VAR> within a JavaScript-related attribute is
 456 detected, the script currently displays no warnings at all.
 457 It might be good to display some kind of warning.
 458
 459 Its sort order (-s option) seems to be different than the real
 460 xgettext(1)'s sort option. This will result in translation
 461 strings inside the generated PO file spuriously moving about
 462 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 463
 464 If a Javascript string has leading spaces, it will
 465 generate strings with spurious leading spaces,
 466 leading to failure to match the strings when actually generating
 467 translated files.
 468
 469 =cut