Bug 12595: (regression tests)
[koha.git] / misc / translator / tmpl_process3.pl
blobeea5f3e2084f73a015d4c75deee339d01a238145
1 #!/usr/bin/perl
2 # This file is part of Koha
3 # Parts copyright 2003-2004 Paul Poulain
4 # Parts copyright 2003-2004 Jerome Vizcaino
5 # Parts copyright 2004 Ambrose Li
7 =head1 NAME
9 tmpl_process3.pl - Alternative version of tmpl_process.pl
10 using gettext-compatible translation files
12 =cut
14 use strict;
15 #use warnings; FIXME - Bug 2505
16 use File::Basename;
17 use Getopt::Long;
18 use Locale::PO;
19 use File::Temp qw( :POSIX );
20 use TmplTokenizer;
21 use VerboseWarnings qw( :warn :die );
23 ###############################################################################
25 use vars qw( $in_dir @filenames $str_file $out_dir $quiet );
26 use vars qw( @excludes $exclude_regex );
27 use vars qw( $recursive_p );
28 use vars qw( $pedantic_p );
29 use vars qw( $href );
30 use vars qw( $type ); # file extension (DOS form without the dot) to match
31 use vars qw( $charset_in $charset_out );
33 ###############################################################################
35 sub find_translation ($) {
36 my($s) = @_;
37 my $key = $s;
38 if ($s =~ /\S/s) {
39 $key = TmplTokenizer::string_canon($key);
40 $key = TmplTokenizer::charset_convert($key, $charset_in, $charset_out);
41 $key = TmplTokenizer::quote_po($key);
43 if (defined $href->{$key} && !$href->{$key}->fuzzy && length Locale::PO->dequote($href->{$key}->msgstr)){
44 if ($s =~ /^(\s+)/){
45 return $1 . Locale::PO->dequote($href->{$key}->msgstr);
47 else {
48 return Locale::PO->dequote($href->{$key}->msgstr);
51 else {
52 return $s;
56 sub text_replace_tag ($$) {
57 my($t, $attr) = @_;
58 my $it;
60 # value [tag=input], meta
61 my $tag = lc($1) if $t =~ /^<(\S+)/s;
62 my $translated_p = 0;
63 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
64 if ($attr->{$a}) {
65 next if $a eq 'label' && $tag ne 'optgroup';
66 next if $a eq 'content' && $tag ne 'meta';
67 next if $a eq 'value' && ($tag ne 'input' || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:checkbox|hidden|radio|text)$/)); # FIXME
69 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
70 if ($val =~ /\S/s) {
71 my $s = find_translation($val);
72 if ($attr->{$a}->[1] ne $s) { #FIXME
73 $attr->{$a}->[1] = $s; # FIXME
74 $attr->{$a}->[2] = ($s =~ /"/s)? "'$s'": "\"$s\""; #FIXME
75 $translated_p = 1;
80 if ($translated_p) {
81 $it = "<$tag"
82 . join('', map { if ($_ ne '/'){
83 sprintf(' %s="%s"', $_, $attr->{$_}->[1]);
85 else {
86 sprintf(' %s',$_);
89 } sort {
90 $attr->{$a}->[3] <=> $attr->{$b}->[3] #FIXME
91 } keys %$attr);
92 $it .= '>';
94 else {
95 $it = $t;
97 return $it;
100 sub text_replace (**) {
101 my($h, $output) = @_;
102 for (;;) {
103 my $s = TmplTokenizer::next_token $h;
104 last unless defined $s;
105 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
106 if ($kind eq C4::TmplTokenType::TEXT) {
107 print $output find_translation($t);
108 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
109 my $fmt = find_translation($s->form);
110 print $output TmplTokenizer::parametrize($fmt, 1, $s, sub {
111 $_ = $_[0];
112 my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes);
113 $kind == C4::TmplTokenType::TAG && %$attr?
114 text_replace_tag($t, $attr): $t });
115 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
116 print $output text_replace_tag($t, $attr);
117 } elsif ($s->has_js_data) {
118 for my $t (@{$s->js_data}) {
119 # FIXME for this whole block
120 if ($t->[0]) {
121 printf $output "%s%s%s", $t->[2], find_translation $t->[3],
122 $t->[2];
123 } else {
124 print $output $t->[1];
127 } elsif (defined $t) {
128 # Quick fix to bug 4472
129 $t = "<!DOCTYPE stylesheet [" if $t =~ /DOCTYPE stylesheet/ ;
130 print $output $t;
135 sub listfiles {
136 my($dir, $type, $action, $filenames) = @_;
137 my @it = ();
138 if (opendir(DIR, $dir)) {
139 my @dirent = readdir DIR; # because DIR is shared when recursing
140 closedir DIR;
141 for my $dirent (@dirent) {
142 my $path = "$dir/$dirent";
143 if ($dirent =~ /^\./ || $dirent eq 'CVS' || $dirent eq 'RCS'
144 || (defined $exclude_regex && $dirent =~ /^(?:$exclude_regex)$/)) {
146 } elsif (-f $path) {
147 my $basename = basename $path;
148 push @it, $path
149 if ( not @$filenames or ( grep { $path =~ /$_/ } @$filenames ) )
150 and (!defined $type || $dirent =~ /\.(?:$type)$/) || $action eq 'install';
151 } elsif (-d $path && $recursive_p) {
152 push @it, listfiles($path, $type, $action, $filenames);
155 } else {
156 warn_normal "$dir: $!", undef;
158 return @it;
161 ###############################################################################
163 sub mkdir_recursive ($) {
164 my($dir) = @_;
165 local($`, $&, $', $1);
166 $dir = $` if $dir ne /^\/+$/ && $dir =~ /\/+$/;
167 my ($prefix, $basename) = ($dir =~ /\/([^\/]+)$/s)? ($`, $1): ('.', $dir);
168 mkdir_recursive($prefix) if $prefix ne '.' && !-d $prefix;
169 if (!-d $dir) {
170 print STDERR "Making directory $dir..." unless $quiet;
171 # creates with rwxrwxr-x permissions
172 mkdir($dir, 0775) || warn_normal "$dir: $!", undef;
176 ###############################################################################
178 sub usage ($) {
179 my($exitcode) = @_;
180 my $h = $exitcode? *STDERR: *STDOUT;
181 print $h <<EOF;
182 Usage: $0 create [OPTION]
183 or: $0 update [OPTION]
184 or: $0 install [OPTION]
185 or: $0 --help
186 Create or update PO files from templates, or install translated templates.
188 -i, --input=SOURCE Get or update strings from SOURCE directory.
189 -o, --outputdir=DIRECTORY Install translation(s) to specified DIRECTORY
190 --pedantic-warnings Issue warnings even for detected problems
191 which are likely to be harmless
192 -r, --recursive SOURCE in the -i option is a directory
193 -f, --filename=FILE FILE is a specific filaneme.
194 If given, only these files will be processed.
195 -s, --str-file=FILE Specify FILE as the translation (po) file
196 for input (install) or output (create, update)
197 -x, --exclude=REGEXP Exclude files matching the given REGEXP
198 --help Display this help and exit
199 -q, --quiet no output to screen (except for errors)
201 The -o option is ignored for the "create" and "update" actions.
202 Try `perldoc $0 for perhaps more information.
204 exit($exitcode);
207 ###############################################################################
209 sub usage_error (;$) {
210 for my $msg (split(/\n/, $_[0])) {
211 print STDERR "$msg\n";
213 print STDERR "Try `$0 --help for more information.\n";
214 exit(-1);
217 ###############################################################################
219 GetOptions(
220 'input|i=s' => \$in_dir,
221 'filename|f=s' => \@filenames,
222 'outputdir|o=s' => \$out_dir,
223 'recursive|r' => \$recursive_p,
224 'str-file|s=s' => \$str_file,
225 'exclude|x=s' => \@excludes,
226 'quiet|q' => \$quiet,
227 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
228 'help' => \&usage,
229 ) || usage_error;
231 VerboseWarnings::set_application_name $0;
232 VerboseWarnings::set_pedantic_mode $pedantic_p;
234 # keep the buggy Locale::PO quiet if it says stupid things
235 $SIG{__WARN__} = sub {
236 my($s) = @_;
237 print STDERR $s unless $s =~ /^Strange line in [^:]+: #~/s
240 my $action = shift or usage_error('You must specify an ACTION.');
241 usage_error('You must at least specify input and string list filenames.')
242 if !$in_dir || !defined $str_file;
244 # Type match defaults to *.tt plus *.inc if not specified
245 $type = "tt|inc|xsl|xml|def" if !defined($type);
247 # Check the inputs for being directories
248 usage_error("$in_dir: Input must be a directory.\n"
249 . "(Symbolic links are not supported at the moment)")
250 unless -d $in_dir;
252 # Generates the global exclude regular expression
253 $exclude_regex = '(?:'.join('|', @excludes).')' if @excludes;
255 my @in_files;
256 # Generate the list of input files if a directory is specified
257 # input is a directory, generates list of files to process
258 $in_dir =~ s/\/$//; # strips the trailing / if any
260 for my $fn ( @filenames ) {
261 die "You cannot specify input files and directories at the same time.\n"
262 if -d $fn;
264 @in_files = listfiles($in_dir, $type, $action, \@filenames);
266 # restores the string list from file
267 $href = Locale::PO->load_file_ashash($str_file);
269 # guess the charsets. HTML::Templates defaults to iso-8859-1
270 if (defined $href) {
271 die "$str_file: PO file is corrupted, or not a PO file\n" unless defined $href->{'""'};
272 $charset_out = TmplTokenizer::charset_canon $2 if $href->{'""'}->msgstr =~ /\bcharset=(["']?)([^;\s"'\\]+)\1/;
273 $charset_in = $charset_out;
274 # for my $msgid (keys %$href) {
275 # if ($msgid =~ /\bcharset=(["']?)([^;\s"'\\]+)\1/) {
276 # my $candidate = TmplTokenizer::charset_canon $2;
277 # die "Conflicting charsets in msgid: $charset_in vs $candidate => $msgid\n"
278 # if defined $charset_in && $charset_in ne $candidate;
279 # $charset_in = $candidate;
283 # BUG6464: check consistency of PO messages
284 # - count number of '%s' in msgid and msgstr
285 for my $msg ( values %$href ) {
286 my $id_count = split(/%s/, $msg->{msgid}) - 1;
287 my $str_count = split(/%s/, $msg->{msgstr}) - 1;
288 next if $id_count == $str_count ||
289 $msg->{msgstr} eq '""' ||
290 grep { /fuzzy/ } @{$msg->{_flags}};
291 warn_normal
292 "unconsistent %s count: ($id_count/$str_count):\n" .
293 " line: " . $msg->{loaded_line_number} . "\n" .
294 " msgid: " . $msg->{msgid} . "\n" .
295 " msgstr: " . $msg->{msgstr} . "\n", undef;
299 # set our charset in to UTF-8
300 if (!defined $charset_in) {
301 $charset_in = TmplTokenizer::charset_canon 'UTF-8';
302 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
304 # set our charset out to UTF-8
305 if (!defined $charset_out) {
306 $charset_out = TmplTokenizer::charset_canon 'UTF-8';
307 warn "Warning: Charset Out defaulting to $charset_out\n";
309 my $xgettext = './xgettext.pl'; # actual text extractor script
310 my $st;
312 if ($action eq 'create') {
313 # updates the list. As the list is empty, every entry will be added
314 if (!-s $str_file) {
315 warn "Removing empty file $str_file\n";
316 unlink $str_file || die "$str_file: $!\n";
318 die "$str_file: Output file already exists\n" if -f $str_file;
319 my($tmph1, $tmpfile1) = tmpnam();
320 my($tmph2, $tmpfile2) = tmpnam();
321 close $tmph2; # We just want a name
322 # Generate the temporary file that acts as <MODULE>/POTFILES.in
323 for my $input (@in_files) {
324 print $tmph1 "$input\n";
326 close $tmph1;
327 warn "I $charset_in O $charset_out";
328 # Generate the specified po file ($str_file)
329 $st = system ($xgettext, '-s', '-f', $tmpfile1, '-o', $tmpfile2,
330 (defined $charset_in? ('-I', $charset_in): ()),
331 (defined $charset_out? ('-O', $charset_out): ())
333 # Run msgmerge so that the pot file looks like a real pot file
334 # We need to help msgmerge a bit by pre-creating a dummy po file that has
335 # the headers and the "" msgid & msgstr. It will fill in the rest.
336 if ($st == 0) {
337 # Merge the temporary "pot file" with the specified po file ($str_file)
338 # FIXME: msgmerge(1) is a Unix dependency
339 # FIXME: need to check the return value
340 unless (-f $str_file) {
341 local(*INPUT, *OUTPUT);
342 open(INPUT, "<$tmpfile2");
343 open(OUTPUT, ">$str_file");
344 while (<INPUT>) {
345 print OUTPUT;
346 last if /^\n/s;
348 close INPUT;
349 close OUTPUT;
351 $st = system("msgmerge -U ".($quiet?'-q':'')." -s $str_file $tmpfile2");
352 } else {
353 error_normal "Text extraction failed: $xgettext: $!\n", undef;
354 error_additional "Will not run msgmerge\n", undef;
356 unlink $tmpfile1 || warn_normal "$tmpfile1: unlink failed: $!\n", undef;
357 unlink $tmpfile2 || warn_normal "$tmpfile2: unlink failed: $!\n", undef;
359 } elsif ($action eq 'update') {
360 my($tmph1, $tmpfile1) = tmpnam();
361 my($tmph2, $tmpfile2) = tmpnam();
362 close $tmph2; # We just want a name
363 # Generate the temporary file that acts as <MODULE>/POTFILES.in
364 for my $input (@in_files) {
365 print $tmph1 "$input\n";
367 close $tmph1;
368 # Generate the temporary file that acts as <MODULE>/<LANG>.pot
369 $st = system($xgettext, '-s', '-f', $tmpfile1, '-o', $tmpfile2,
370 '--po-mode',
371 (defined $charset_in? ('-I', $charset_in): ()),
372 (defined $charset_out? ('-O', $charset_out): ()));
373 if ($st == 0) {
374 # Merge the temporary "pot file" with the specified po file ($str_file)
375 # FIXME: msgmerge(1) is a Unix dependency
376 # FIXME: need to check the return value
377 if ( @filenames ) {
378 my ($tmph3, $tmpfile3) = tmpnam();
379 $st = system("msgcat $str_file $tmpfile2 > $tmpfile3");
380 $st = system("msgmerge -U ".($quiet?'-q':'')." -s $str_file $tmpfile3")
381 unless $st;
382 } else {
383 $st = system("msgmerge -U ".($quiet?'-q':'')." -s $str_file $tmpfile2");
385 } else {
386 error_normal "Text extraction failed: $xgettext: $!\n", undef;
387 error_additional "Will not run msgmerge\n", undef;
389 unlink $tmpfile1 || warn_normal "$tmpfile1: unlink failed: $!\n", undef;
390 unlink $tmpfile2 || warn_normal "$tmpfile2: unlink failed: $!\n", undef;
392 } elsif ($action eq 'install') {
393 if(!defined($out_dir)) {
394 usage_error("You must specify an output directory when using the install method.");
397 if ($in_dir eq $out_dir) {
398 warn "You must specify a different input and output directory.\n";
399 exit -1;
402 # Make sure the output directory exists
403 # (It will auto-create it, but for compatibility we should not)
404 -d $out_dir || die "$out_dir: The directory does not exist\n";
406 # Try to open the file, because Locale::PO doesn't check :-/
407 open(INPUT, "<$str_file") || die "$str_file: $!\n";
408 close INPUT;
410 # creates the new tmpl file using the new translation
411 for my $input (@in_files) {
412 die "Assertion failed"
413 unless substr($input, 0, length($in_dir) + 1) eq "$in_dir/";
415 my $target = $out_dir . substr($input, length($in_dir));
416 my $targetdir = $` if $target =~ /[^\/]+$/s;
418 if (!defined $type || $input =~ /\.(?:$type)$/) {
419 my $h = TmplTokenizer->new( $input );
420 $h->set_allow_cformat( 1 );
421 VerboseWarnings::set_input_file_name $input;
422 mkdir_recursive($targetdir) unless -d $targetdir;
423 print STDERR "Creating $target...\n" unless $quiet;
424 open( OUTPUT, ">$target" ) || die "$target: $!\n";
425 text_replace( $h, *OUTPUT );
426 close OUTPUT;
427 } else {
428 # just copying the file
429 mkdir_recursive($targetdir) unless -d $targetdir;
430 system("cp -f $input $target");
431 print STDERR "Copying $input...\n" unless $quiet;
435 } else {
436 usage_error('Unknown action specified.');
439 if ($st == 0) {
440 printf "The %s seems to be successful.\n", $action unless $quiet;
441 } else {
442 printf "%s FAILED.\n", "\u$action" unless $quiet;
444 exit 0;
446 ###############################################################################
448 =head1 SYNOPSIS
450 ./tmpl_process3.pl [ I<tmpl_process.pl options> ]
452 =head1 DESCRIPTION
454 This is an alternative version of the tmpl_process.pl script,
455 using standard gettext-style PO files. While there still might
456 be changes made to the way it extracts strings, at this moment
457 it should be stable enough for general use; it is already being
458 used for the Chinese and Polish translations.
460 Currently, the create, update, and install actions have all been
461 reimplemented and seem to work.
463 =head2 Features
465 =over
467 =item -
469 Translation files in standard Uniforum PO format.
470 All standard tools including all gettext tools,
471 plus PO file editors like kbabel(1) etc.
472 can be used.
474 =item -
476 Minor changes in whitespace in source templates
477 do not generally require strings to be re-translated.
479 =item -
481 Able to handle <TMPL_VAR> variables in the templates;
482 <TMPL_VAR> variables are usually extracted in proper context,
483 represented by a short %s placeholder.
485 =item -
487 Able to handle text input and radio button INPUT elements
488 in the templates; these INPUT elements are also usually
489 extracted in proper context,
490 represented by a short %S or %p placeholder.
492 =item -
494 Automatic comments in the generated PO files to provide
495 even more context (line numbers, and the names and types
496 of the variables).
498 =item -
500 The %I<n>$s (or %I<n>$p, etc.) notation can be used
501 for change the ordering of the variables,
502 if such a reordering is required for correct translation.
504 =item -
506 If a particular <TMPL_VAR> should not appear in the
507 translation, it can be suppressed with the %0.0s notation.
509 =item -
511 Using the PO format also means translators can add their
512 own comments in the translation files, if necessary.
514 =item -
516 Create, update, and install actions are all based on the
517 same scanner module. This ensures that update and install
518 have the same idea of what is a translatable string;
519 attribute names in tags, for example, will not be
520 accidentally translated.
522 =back
524 =head1 NOTES
526 Anchors are represented by an <AI<n>> notation.
527 The meaning of this non-standard notation might not be obvious.
529 The create action calls xgettext.pl to do the actual work;
530 the update action calls xgettext.pl and msgmerge(1) to do the
531 actual work.
533 =head1 BUGS
535 xgettext.pl must be present in the current directory; the
536 msgmerge(1) command must also be present in the search path.
537 The script currently does not check carefully whether these
538 dependent commands are present.
540 Locale::PO(3) has a lot of bugs. It can neither parse nor
541 generate GNU PO files properly; a couple of workarounds have
542 been written in TmplTokenizer and more is likely to be needed
543 (e.g., to get rid of the "Strange line" warning for #~).
545 This script may not work in Windows.
547 There are probably some other bugs too, since this has not been
548 tested very much.
550 =head1 SEE ALSO
552 xgettext.pl,
553 TmplTokenizer.pm,
554 msgmerge(1),
555 Locale::PO(3),
556 translator_doc.txt
558 http://www.saas.nsw.edu.au/koha_wiki/index.php?page=DifficultTerms
560 =cut