misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use warnings;
  11 use Getopt::Long;
  12 use POSIX;
  13 use Locale::PO;
  14 use TmplTokenizer;
  15 use VerboseWarnings;
  16
  17 use vars qw( $convert_from );
  18 use vars qw( $files_from $directory $output $sort );
  19 use vars qw( $extract_all_p );
  20 use vars qw( $pedantic_p );
  21 use vars qw( %text %translation );
  22 use vars qw( $charset_in $charset_out );
  23 use vars qw( $disable_fuzzy_p );
  24 use vars qw( $verbose_p );
  25 use vars qw( $po_mode_p );
  26
  27 ###############################################################################
  28
  29 sub string_negligible_p ($) {
  30     my($t) = @_;                                # a string
  31     # Don't emit pure whitespace, pure numbers, pure punctuation,
  32     # single letters, or TMPL_VAR's.
  33     # Punctuation should arguably be translated. But without context
  34     # they are untranslatable. Note that $t is a string, not a token object.
  35     return !$extract_all_p && (
  36                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  37             || $t =~ /^\d+$/                    # purely digits
  38             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  39             || $t =~ /^[A-Za-z]$/               # single letters
  40         )
  41 }
  42
  43 sub token_negligible_p( $ ) {
  44     my($x) = @_;
  45     my $t = $x->type;
  46     return !$extract_all_p && (
  47             $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
  48             $t == TmplTokenType::DIRECTIVE? 1:
  49             $t == TmplTokenType::TEXT_PARAMETRIZED
  50                 && join( '', map { my $t = $_->type;
  51                         $t == TmplTokenType::DIRECTIVE?
  52                                 '1': $t == TmplTokenType::TAG?
  53                                         '': token_negligible_p( $_ )?
  54                                         '': '1' } @{$x->children} ) eq '' );
  55 }
  56
  57 ###############################################################################
  58
  59 sub remember ($$) {
  60     my($token, $string) = @_;
  61     # If we determine that the string is negligible, don't bother to remember
  62     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  63         my $key = TmplTokenizer::string_canon( $string );
  64         $text{$key} = [] unless defined $text{$key};
  65         push @{$text{$key}}, $token;
  66     }
  67 }
  68
  69 ###############################################################################
  70
  71 sub string_list () {
  72     my @t = keys %text;
  73     # The real gettext tools seems to sort case sensitively; I don't know why
  74     @t = sort { $a cmp $b } @t if $sort eq 's';
  75     @t = sort {
  76             my @aa = sort { $a->pathname cmp $b->pathname
  77                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  78             my @bb = sort { $a->pathname cmp $b->pathname
  79                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  80             $aa[0]->pathname cmp $bb[0]->pathname
  81                     || $aa[0]->line_number <=> $bb[0]->line_number;
  82         } @t if $sort eq 'F';
  83     return @t;
  84 }
  85
  86 ###############################################################################
  87
  88 sub text_extract (*) {
  89     my($h) = @_;
  90     for (;;) {
  91         my $s = TmplTokenizer::next_token $h;
  92     last unless defined $s;
  93         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  94         if ($kind eq TmplTokenType::TEXT) {
  95             remember( $s, $t ) if $t =~ /\S/s;
  96         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  97             remember( $s, $s->form ) if $s->form =~ /\S/s;
  98         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
  99             # value [tag=input], meta
 100             my $tag = lc($1) if $t =~ /^<(\S+)/s;
 101             for my $a ('alt', 'content', 'title', 'value','label') {
 102                 if ($attr->{$a}) {
 103             next if $a eq 'label' && $tag ne 'optgroup';
 104                     next if $a eq 'content' && $tag ne 'meta';
 105                     next if $a eq 'value' && ($tag ne 'input'
 106                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
 107                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 108                     $val = TmplTokenizer::trim $val;
 109                     remember( $s, $val ) if $val =~ /\S/s;
 110                 }
 111             }
 112         } elsif ($s->has_js_data) {
 113             for my $t (@{$s->js_data}) {
 114                 remember( $s, $t->[3] ) if $t->[0]; # FIXME
 115             }
 116         }
 117     }
 118 }
 119
 120 ###############################################################################
 121
 122 sub generate_strings_list () {
 123     # Emit all extracted strings.
 124     for my $t (string_list) {
 125         printf OUTPUT "%s\n", $t;
 126     }
 127 }
 128
 129 ###############################################################################
 130
 131 sub generate_po_file () {
 132     # We don't emit the Plural-Forms header; it's meaningless for us
 133     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 134     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 135     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 136     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 137     my $time_pot = $time;
 138     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 139     print OUTPUT <<EOF;
 140 # SOME DESCRIPTIVE TITLE.
 141 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 142 # This file is distributed under the same license as the PACKAGE package.
 143 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 144 #
 145 EOF
 146     print OUTPUT <<EOF unless $disable_fuzzy_p;
 147 #, fuzzy
 148 EOF
 149     print OUTPUT <<EOF;
 150 msgid ""
 151 msgstr ""
 152 "Project-Id-Version: PACKAGE VERSION\\n"
 153 "POT-Creation-Date: $time_pot\\n"
 154 "PO-Revision-Date: $time_po\\n"
 155 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 156 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 157 "MIME-Version: 1.0\\n"
 158 "Content-Type: text/plain; charset=$pot_charset\\n"
 159 "Content-Transfer-Encoding: 8bit\\n"
 160
 161 EOF
 162     my $directory_re = quotemeta("$directory/");
 163     for my $t (string_list) {
 164         if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
 165             my($token, $n) = ($text{$t}->[0], 0);
 166             printf OUTPUT "#. For the first occurrence,\n"
 167                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 168             for my $param ($token->parameters_and_fields) {
 169                 $n += 1;
 170                 my $type = $param->type;
 171                 my $subtype = ($type == TmplTokenType::TAG
 172                         && $param->string =~ /^<input\b/is?
 173                                 $param->attributes->{'type'}->[1]: undef);
 174                 my $fmt = TmplTokenizer::_formalize( $param );
 175                 $fmt =~ s/^%/%$n\$/;
 176                 if ($type == TmplTokenType::DIRECTIVE) {
 177                     $type = $param->string =~ /(TMPL_[A-Z]+)+/is? $1: 'ERROR';
 178                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 179                             $2: undef;
 180                     printf OUTPUT "#. %s: %s\n", $fmt,
 181                         "$type" . (defined $name? " name=$name": '');
 182                 } else {
 183                     my $name = $param->attributes->{'name'};
 184                     my $value = $param->attributes->{'value'}
 185                             unless $subtype =~ /^(?:text)$/;
 186                     printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 187                             . (defined $name?  " name=$name->[1]": '')
 188                             . (defined $value? " value=$value->[1]": '');
 189                 }
 190             }
 191         } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
 192             my($token) = ($text{$t}->[0]);
 193             printf OUTPUT "#. For the first occurrence,\n"
 194                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 195             if ($token->string =~ /^<meta\b/is) {
 196                 my $type = $token->attributes->{'http-equiv'}->[1];
 197                 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
 198             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 199                 my $tag = uc($1);
 200                 my $type = (lc($tag) eq 'input'?
 201                         $token->attributes->{'type'}: undef);
 202                 my $name = $token->attributes->{'name'};
 203                 printf OUTPUT "#. %s\n", $tag
 204                     . (defined $type? " type=$type->[1]": '')
 205                     . (defined $name? " name=$name->[1]": '');
 206             }
 207         } elsif ($text{$t}->[0]->has_js_data) {
 208             printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 209             printf OUTPUT "#. SCRIPT\n";
 210         }
 211         my $cformat_p;
 212         for my $token (@{$text{$t}}) {
 213             my $pathname = $token->pathname;
 214             $pathname =~ s/^$directory_re//os;
 215             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 216                     if defined $pathname && defined $token->line_number;
 217             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 218         }
 219         printf OUTPUT "#, c-format\n" if $cformat_p;
 220         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 221                 TmplTokenizer::string_canon
 222                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 223         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 224                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 225     }
 226 }
 227
 228 ###############################################################################
 229
 230 sub convert_translation_file () {
 231     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 232     VerboseWarnings::set_input_file_name $convert_from;
 233     while (<INPUT>) {
 234         chomp;
 235         my($msgid, $msgstr) = split(/\t/);
 236         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 237                 unless defined $msgstr;
 238
 239         # Fixup some of the bad strings
 240         $msgid =~ s/^SELECTED>//;
 241
 242         # Create dummy token
 243         my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
 244         remember( $token, $msgid );
 245         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 246         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 247
 248         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 249             my $candidate = TmplTokenizer::charset_canon $2;
 250             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 251                     if defined $charset_in && $charset_in ne $candidate;
 252             $charset_in = $candidate;
 253         }
 254         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 255             my $candidate = TmplTokenizer::charset_canon $2;
 256             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 257                     if defined $charset_out && $charset_out ne $candidate;
 258             $charset_out = $candidate;
 259         }
 260     }
 261     # The following assumption is correct; that's what HTML::Template assumes
 262     if (!defined $charset_in) {
 263         $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
 264         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 265     }
 266 }
 267
 268 ###############################################################################
 269
 270 sub usage ($) {
 271     my($exitcode) = @_;
 272     my $h = $exitcode? *STDERR: *STDOUT;
 273     print $h <<EOF;
 274 Usage: $0 [OPTIONS]
 275 Extract translatable strings from given HTML::Template input files.
 276
 277 Input file location:
 278   -f, --files-from=FILE          Get list of input files from FILE
 279   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 280
 281 Output file location:
 282   -o, --output=FILE              Write output to specified file
 283
 284 HTML::Template options:
 285   -a, --extract-all              Extract all strings
 286       --pedantic-warnings        Issue warnings even for detected problems
 287                                  which are likely to be harmless
 288
 289 Output details:
 290   -s, --sort-output              generate sorted output
 291   -F, --sort-by-file             sort output by file location
 292   -v, --verbose                  explain what is being done
 293
 294 Informative output:
 295       --help                     Display this help and exit
 296
 297 Try `perldoc $0' for perhaps more information.
 298 EOF
 299     exit($exitcode);
 300 }
 301
 302 ###############################################################################
 303
 304 sub usage_error (;$) {
 305     print STDERR "$_[0]\n" if @_;
 306     print STDERR "Try `$0 --help' for more information.\n";
 307     exit(-1);
 308 }
 309
 310 ###############################################################################
 311
 312 Getopt::Long::config qw( bundling no_auto_abbrev );
 313 GetOptions(
 314     'a|extract-all'                     => \$extract_all_p,
 315     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 316     'convert-from=s'                    => \$convert_from,
 317     'D|directory=s'                     => \$directory,
 318     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 319     'f|files-from=s'                    => \$files_from,
 320     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 321     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 322     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 323     'output|o=s'                        => \$output,
 324     'po-mode'                           => \$po_mode_p,         # INTERNAL
 325     's|sort-output'                     => sub { $sort = 's' },
 326     'F|sort-by-file'                    => sub { $sort = 'F' },
 327     'v|verbose'                         => \$verbose_p,
 328     'help'                              => sub { usage(0) },
 329 ) || usage_error;
 330
 331 VerboseWarnings::set_application_name $0;
 332 VerboseWarnings::set_pedantic_mode $pedantic_p;
 333
 334 usage_error('Missing mandatory option -f')
 335         unless defined $files_from || defined $convert_from;
 336 $directory = '.' unless defined $directory;
 337
 338 usage_error('You cannot specify both --convert-from and --files-from')
 339         if defined $convert_from && defined $files_from;
 340
 341 if (defined $output && $output ne '-') {
 342     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 343     open(OUTPUT, ">$output") || die "$output: $!\n";
 344 } else {
 345     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 346     open(OUTPUT, ">&STDOUT");
 347 }
 348
 349 if (defined $files_from) {
 350     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 351     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 352     while (<INPUT>) {
 353         chomp;
 354         my $input = /^\//? $_: "$directory/$_";
 355         my $h = TmplTokenizer->new( $input );
 356         $h->set_allow_cformat( 1 );
 357         VerboseWarnings::set_input_file_name $input;
 358         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 359         text_extract( $h );
 360     }
 361     close INPUT;
 362 } else {
 363     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 364     convert_translation_file;
 365 }
 366 generate_po_file;
 367
 368 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 369         if TmplTokenizer::syntaxerror_p;
 370
 371
 372 exit(-1) if TmplTokenizer::fatal_p;
 373
 374 ###############################################################################
 375
 376 =head1 DESCRIPTION
 377
 378 This is an experimental script based on the modularized
 379 text-extract2.pl script.  It has behaviour similar to
 380 xgettext(1), and generates gettext-compatible output files.
 381
 382 A gettext-like format provides the following advantages:
 383
 384 =over
 385
 386 =item -
 387
 388 Translation to non-English-like languages with different word
 389 order:  gettext's c-format strings can theoretically be
 390 emulated if we are able to do some analysis on the .tmpl input
 391 and treat <TMPL_VAR> in a way similar to %s.
 392
 393 =item -
 394
 395 Context for the extracted strings:  the gettext format provides
 396 the filenames and line numbers where each string can be found.
 397 The translator can read the source file and see the context,
 398 in case the string by itself can mean several different things.
 399
 400 =item -
 401
 402 Place for the translator to add comments about the translations.
 403
 404 =item -
 405
 406 Gettext-compatible tools, if any, might be usable if we adopt
 407 the gettext format.
 408
 409 =back
 410
 411 This script has already been in use for over a year and should
 412 be reasonable stable. Nevertheless, it is still somewhat
 413 experimental and there are still some issues.
 414
 415 Please refer to the explanation in tmpl_process3 for further
 416 details.
 417
 418 If you want to generate GNOME-style POTFILES.in files, such
 419 files (passed to -f) can be generated thus:
 420
 421         (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 422                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 423         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 424                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 425
 426 This is, however, quite pointless, because the "create" and
 427 "update" actions have already been implemented in tmpl_process3.pl.
 428
 429 =head2 Strings inside JavaScript
 430
 431 In the SCRIPT elements, the script will attempt to scan for
 432 _("I<string literal>") patterns, and extract the I<string literal>
 433 as a translatable string.
 434
 435 Note that the C-like _(...) notation is required.
 436
 437 The JavaScript must actually define a _ function
 438 so that the code remains correct JavaScript.
 439 A suitable definition of such a function can be
 440
 441         function _(s) { return s } // dummy function for gettext
 442
 443 =head1 SEE ALSO
 444
 445 tmpl_process3.pl,
 446 xgettext(1),
 447 Locale::PO(3),
 448 translator_doc.txt
 449
 450 =head1 BUGS
 451
 452 There probably are some. Bugs related to scanning of <INPUT>
 453 tags seem to be especially likely to be present.
 454
 455 Its diagnostics are probably too verbose.
 456
 457 When a <TMPL_VAR> within a JavaScript-related attribute is
 458 detected, the script currently displays no warnings at all.
 459 It might be good to display some kind of warning.
 460
 461 Its sort order (-s option) seems to be different than the real
 462 xgettext(1)'s sort option. This will result in translation
 463 strings inside the generated PO file spuriously moving about
 464 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 465
 466 If a Javascript string has leading spaces, it will
 467 generate strings with spurious leading spaces,
 468 leading to failure to match the strings when actually generating
 469 translated files.
 470
 471 =cut