Bug 21728: DBRev 18.12.00.019
[koha.git] / misc / translator / tmpl_process3.pl
blob6f4c39e78632191aaf9ed0da7eb0d7766147310f
1 #!/usr/bin/perl
2 # This file is part of Koha
3 # Parts copyright 2003-2004 Paul Poulain
4 # Parts copyright 2003-2004 Jerome Vizcaino
5 # Parts copyright 2004 Ambrose Li
7 use FindBin;
8 use lib $FindBin::Bin;
10 =head1 NAME
12 tmpl_process3.pl - Alternative version of tmpl_process.pl
13 using gettext-compatible translation files
15 =cut
17 use strict;
18 #use warnings; FIXME - Bug 2505
19 use File::Basename;
20 use Getopt::Long;
21 use Locale::PO;
22 use File::Temp qw( :POSIX );
23 use TmplTokenizer;
24 use VerboseWarnings qw( :warn :die );
26 ###############################################################################
28 use vars qw( @in_dirs @filenames @match @nomatch $str_file $out_dir $quiet );
29 use vars qw( @excludes $exclude_regex );
30 use vars qw( $recursive_p );
31 use vars qw( $pedantic_p );
32 use vars qw( $href );
33 use vars qw( $type ); # file extension (DOS form without the dot) to match
34 use vars qw( $charset_in $charset_out );
36 ###############################################################################
38 sub find_translation ($) {
39 my($s) = @_;
40 my $key = $s;
41 if ($s =~ /\S/s) {
42 $key = TmplTokenizer::string_canon($key);
43 $key = TmplTokenizer::charset_convert($key, $charset_in, $charset_out);
44 $key = TmplTokenizer::quote_po($key);
46 if (defined $href->{$key} && !$href->{$key}->fuzzy && length Locale::PO->dequote($href->{$key}->msgstr)){
47 if ($s =~ /^(\s+)/){
48 return $1 . Locale::PO->dequote($href->{$key}->msgstr);
50 else {
51 return Locale::PO->dequote($href->{$key}->msgstr);
54 else {
55 return $s;
59 sub text_replace_tag ($$) {
60 my($t, $attr) = @_;
61 my $it;
63 # value [tag=input], meta
64 my $tag = lc($1) if $t =~ /^<(\S+)/s;
65 my $translated_p = 0;
66 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') {
67 if ($attr->{$a}) {
68 next if $a eq 'label' && $tag ne 'optgroup';
69 next if $a eq 'content' && $tag ne 'meta';
70 next if $a eq 'value' && ($tag ne 'input' || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:checkbox|hidden|radio)$/)); # FIXME
72 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
73 if ($val =~ /\S/s) {
74 my $s = find_translation($val);
75 if ($attr->{$a}->[1] ne $s) { #FIXME
76 $attr->{$a}->[1] = $s; # FIXME
77 $attr->{$a}->[2] = ($s =~ /"/s)? "'$s'": "\"$s\""; #FIXME
78 $translated_p = 1;
83 if ($translated_p) {
84 $it = "<$tag"
85 . join('', map { if ($_ ne '/'){
86 sprintf(' %s="%s"', $_, $attr->{$_}->[1]);
88 else {
89 sprintf(' %s',$_);
92 } sort {
93 $attr->{$a}->[3] <=> $attr->{$b}->[3] #FIXME
94 || $a cmp $b # Sort attributes BZ 22236
95 } keys %$attr);
96 $it .= '>';
98 else {
99 $it = $t;
101 return $it;
104 sub text_replace (**) {
105 my($h, $output) = @_;
106 for (;;) {
107 my $s = TmplTokenizer::next_token $h;
108 last unless defined $s;
109 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
110 if ($kind eq C4::TmplTokenType::TEXT) {
111 print $output find_translation($t);
112 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
113 my $fmt = find_translation($s->form);
114 print $output TmplTokenizer::parametrize($fmt, 1, $s, sub {
115 $_ = $_[0];
116 my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes);
117 $kind == C4::TmplTokenType::TAG && %$attr?
118 text_replace_tag($t, $attr): $t });
119 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
120 print $output text_replace_tag($t, $attr);
121 } elsif ($s->has_js_data) {
122 for my $t (@{$s->js_data}) {
123 # FIXME for this whole block
124 if ($t->[0]) {
125 printf $output "%s%s%s", $t->[2], find_translation $t->[3],
126 $t->[2];
127 } else {
128 print $output $t->[1];
131 } elsif (defined $t) {
132 # Quick fix to bug 4472
133 $t = "<!DOCTYPE stylesheet [" if $t =~ /DOCTYPE stylesheet/ ;
134 print $output $t;
139 sub listfiles {
140 my($dir, $type, $action) = @_;
141 my $filenames = join ('|', @filenames); # used to update strings from this file
142 my $match = join ('|', @match); # use only this files
143 my $nomatch = join ('|', @nomatch); # do no use this files
144 my @it = ();
145 if (opendir(DIR, $dir)) {
146 my @dirent = readdir DIR; # because DIR is shared when recursing
147 closedir DIR;
148 for my $dirent (@dirent) {
149 my $path = "$dir/$dirent";
150 if ($dirent =~ /^\./ || $dirent eq 'CVS' || $dirent eq 'RCS'
151 || (defined $exclude_regex && $dirent =~ /^(?:$exclude_regex)$/)) {
153 } elsif (-f $path) {
154 my $basename = fileparse( $path );
155 push @it, $path
156 if ( not @filenames or $basename =~ /($filenames)/i )
157 and ( not @match or $basename =~ /($match)/i ) # files to include
158 and ( not @nomatch or $basename !~ /($nomatch)/i ) # files not to include
159 and (!defined $type || $dirent =~ /\.(?:$type)$/) || $action eq 'install';
160 } elsif (-d $path && $recursive_p) {
161 push @it, listfiles($path, $type, $action);
164 } else {
165 warn_normal "$dir: $!", undef;
167 return @it;
170 ###############################################################################
172 sub mkdir_recursive ($) {
173 my($dir) = @_;
174 local($`, $&, $', $1);
175 $dir = $` if $dir ne /^\/+$/ && $dir =~ /\/+$/;
176 my ($prefix, $basename) = ($dir =~ /\/([^\/]+)$/s)? ($`, $1): ('.', $dir);
177 mkdir_recursive($prefix) if $prefix ne '.' && !-d $prefix;
178 if (!-d $dir) {
179 print STDERR "Making directory $dir...\n" unless $quiet;
180 # creates with rwxrwxr-x permissions
181 mkdir($dir, 0775) || warn_normal "$dir: $!", undef;
185 ###############################################################################
187 sub usage ($) {
188 my($exitcode) = @_;
189 my $h = $exitcode? *STDERR: *STDOUT;
190 print $h <<EOF;
191 Usage: $0 create [OPTION]
192 or: $0 update [OPTION]
193 or: $0 install [OPTION]
194 or: $0 --help
195 Create or update PO files from templates, or install translated templates.
197 -i, --input=SOURCE Get or update strings from SOURCE directory(s).
198 On create or update can have multiple values.
199 On install only one value.
200 -o, --outputdir=DIRECTORY Install translation(s) to specified DIRECTORY
201 --pedantic-warnings Issue warnings even for detected problems
202 which are likely to be harmless
203 -r, --recursive SOURCE in the -i option is a directory
204 -f, --filename=FILE FILE is a specific filename or part of it.
205 If given, only these files will be processed.
206 On update only relevant strings will be updated.
207 -m, --match=FILE FILE is a specific filename or part of it.
208 If given, only these files will be processed.
209 -n, --nomatch=FILE FILE is a specific filename or part of it.
210 If given, these files will not be processed.
211 -s, --str-file=FILE Specify FILE as the translation (po) file
212 for input (install) or output (create, update)
213 -x, --exclude=REGEXP Exclude dirs matching the given REGEXP
214 --help Display this help and exit
215 -q, --quiet no output to screen (except for errors)
217 The -o option is ignored for the "create" and "update" actions.
218 Try `perldoc $0` for perhaps more information.
220 exit($exitcode);
223 ###############################################################################
225 sub usage_error (;$) {
226 for my $msg (split(/\n/, $_[0])) {
227 print STDERR "$msg\n";
229 print STDERR "Try `$0 --help for more information.\n";
230 exit(-1);
233 ###############################################################################
235 GetOptions(
236 'input|i=s' => \@in_dirs,
237 'filename|f=s' => \@filenames,
238 'match|m=s' => \@match,
239 'nomatch|n=s' => \@nomatch,
240 'outputdir|o=s' => \$out_dir,
241 'recursive|r' => \$recursive_p,
242 'str-file|s=s' => \$str_file,
243 'exclude|x=s' => \@excludes,
244 'quiet|q' => \$quiet,
245 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
246 'help' => \&usage,
247 ) || usage_error;
249 VerboseWarnings::set_application_name $0;
250 VerboseWarnings::set_pedantic_mode $pedantic_p;
252 # keep the buggy Locale::PO quiet if it says stupid things
253 $SIG{__WARN__} = sub {
254 my($s) = @_;
255 print STDERR $s unless $s =~ /^Strange line in [^:]+: #~/s
258 my $action = shift or usage_error('You must specify an ACTION.');
259 usage_error('You must at least specify input and string list filenames.')
260 if !@in_dirs || !defined $str_file;
262 # Type match defaults to *.tt plus *.inc if not specified
263 $type = "tt|inc|xsl|xml|def" if !defined($type);
265 # Check the inputs for being directories
266 for my $in_dir ( @in_dirs ) {
267 usage_error("$in_dir: Input must be a directory.\n"
268 . "(Symbolic links are not supported at the moment)")
269 unless -d $in_dir;
272 # Generates the global exclude regular expression
273 $exclude_regex = '(?:'.join('|', @excludes).')' if @excludes;
275 my @in_files;
276 # Generate the list of input files if a directory is specified
277 # input is a directory, generates list of files to process
279 for my $fn ( @filenames ) {
280 die "You cannot specify input files and directories at the same time.\n"
281 if -d $fn;
283 for my $in_dir ( @in_dirs ) {
284 $in_dir =~ s/\/$//; # strips the trailing / if any
285 @in_files = ( @in_files, listfiles($in_dir, $type, $action));
288 # restores the string list from file
289 $href = Locale::PO->load_file_ashash($str_file);
291 # guess the charsets. HTML::Templates defaults to iso-8859-1
292 if (defined $href) {
293 die "$str_file: PO file is corrupted, or not a PO file\n" unless defined $href->{'""'};
294 $charset_out = TmplTokenizer::charset_canon $2 if $href->{'""'}->msgstr =~ /\bcharset=(["']?)([^;\s"'\\]+)\1/;
295 $charset_in = $charset_out;
296 # for my $msgid (keys %$href) {
297 # if ($msgid =~ /\bcharset=(["']?)([^;\s"'\\]+)\1/) {
298 # my $candidate = TmplTokenizer::charset_canon $2;
299 # die "Conflicting charsets in msgid: $charset_in vs $candidate => $msgid\n"
300 # if defined $charset_in && $charset_in ne $candidate;
301 # $charset_in = $candidate;
305 # BUG6464: check consistency of PO messages
306 # - count number of '%s' in msgid and msgstr
307 for my $msg ( values %$href ) {
308 my $id_count = split(/%s/, $msg->{msgid}) - 1;
309 my $str_count = split(/%s/, $msg->{msgstr}) - 1;
310 next if $id_count == $str_count ||
311 $msg->{msgstr} eq '""' ||
312 grep { /fuzzy/ } @{$msg->{_flags}};
313 warn_normal
314 "unconsistent %s count: ($id_count/$str_count):\n" .
315 " line: " . $msg->{loaded_line_number} . "\n" .
316 " msgid: " . $msg->{msgid} . "\n" .
317 " msgstr: " . $msg->{msgstr} . "\n", undef;
321 # set our charset in to UTF-8
322 if (!defined $charset_in) {
323 $charset_in = TmplTokenizer::charset_canon 'UTF-8';
324 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
326 # set our charset out to UTF-8
327 if (!defined $charset_out) {
328 $charset_out = TmplTokenizer::charset_canon 'UTF-8';
329 warn "Warning: Charset Out defaulting to $charset_out\n";
331 my $xgettext = './xgettext.pl'; # actual text extractor script
332 my $st;
334 if ($action eq 'create') {
335 # updates the list. As the list is empty, every entry will be added
336 if (!-s $str_file) {
337 warn "Removing empty file $str_file\n";
338 unlink $str_file || die "$str_file: $!\n";
340 die "$str_file: Output file already exists\n" if -f $str_file;
341 my($tmph1, $tmpfile1) = tmpnam();
342 my($tmph2, $tmpfile2) = tmpnam();
343 close $tmph2; # We just want a name
344 # Generate the temporary file that acts as <MODULE>/POTFILES.in
345 for my $input (@in_files) {
346 print $tmph1 "$input\n";
348 close $tmph1;
349 warn "I $charset_in O $charset_out";
350 # Generate the specified po file ($str_file)
351 $st = system ($xgettext, '-s', '-f', $tmpfile1, '-o', $tmpfile2,
352 (defined $charset_in? ('-I', $charset_in): ()),
353 (defined $charset_out? ('-O', $charset_out): ())
355 # Run msgmerge so that the pot file looks like a real pot file
356 # We need to help msgmerge a bit by pre-creating a dummy po file that has
357 # the headers and the "" msgid & msgstr. It will fill in the rest.
358 if ($st == 0) {
359 # Merge the temporary "pot file" with the specified po file ($str_file)
360 # FIXME: msgmerge(1) is a Unix dependency
361 # FIXME: need to check the return value
362 unless (-f $str_file) {
363 local(*INPUT, *OUTPUT);
364 open(INPUT, "<$tmpfile2");
365 open(OUTPUT, ">$str_file");
366 while (<INPUT>) {
367 print OUTPUT;
368 last if /^\n/s;
370 close INPUT;
371 close OUTPUT;
373 $st = system("msgmerge ".($quiet?'-q':'')." -s $str_file $tmpfile2 -o - | msgattrib --no-obsolete -o $str_file");
374 } else {
375 error_normal "Text extraction failed: $xgettext: $!\n", undef;
376 error_additional "Will not run msgmerge\n", undef;
378 unlink $tmpfile1 || warn_normal "$tmpfile1: unlink failed: $!\n", undef;
379 unlink $tmpfile2 || warn_normal "$tmpfile2: unlink failed: $!\n", undef;
381 } elsif ($action eq 'update') {
382 my($tmph1, $tmpfile1) = tmpnam();
383 my($tmph2, $tmpfile2) = tmpnam();
384 close $tmph2; # We just want a name
385 # Generate the temporary file that acts as <MODULE>/POTFILES.in
386 for my $input (@in_files) {
387 print $tmph1 "$input\n";
389 close $tmph1;
390 # Generate the temporary file that acts as <MODULE>/<LANG>.pot
391 $st = system($xgettext, '-s', '-f', $tmpfile1, '-o', $tmpfile2,
392 '--po-mode',
393 (defined $charset_in? ('-I', $charset_in): ()),
394 (defined $charset_out? ('-O', $charset_out): ()));
395 if ($st == 0) {
396 # Merge the temporary "pot file" with the specified po file ($str_file)
397 # FIXME: msgmerge(1) is a Unix dependency
398 # FIXME: need to check the return value
399 if ( @filenames ) {
400 my ($tmph3, $tmpfile3) = tmpnam();
401 $st = system("msgcat $str_file $tmpfile2 > $tmpfile3");
402 $st = system("msgmerge ".($quiet?'-q':'')." -s $str_file $tmpfile3 -o - | msgattrib --no-obsolete -o $str_file")
403 unless $st;
404 } else {
405 $st = system("msgmerge ".($quiet?'-q':'')." -s $str_file $tmpfile2 -o - | msgattrib --no-obsolete -o $str_file");
407 } else {
408 error_normal "Text extraction failed: $xgettext: $!\n", undef;
409 error_additional "Will not run msgmerge\n", undef;
411 unlink $tmpfile1 || warn_normal "$tmpfile1: unlink failed: $!\n", undef;
412 unlink $tmpfile2 || warn_normal "$tmpfile2: unlink failed: $!\n", undef;
414 } elsif ($action eq 'install') {
415 if(!defined($out_dir)) {
416 usage_error("You must specify an output directory when using the install method.");
419 if ( scalar @in_dirs > 1 ) {
420 usage_error("You must specify only one input directory when using the install method.");
423 my $in_dir = shift @in_dirs;
425 if ($in_dir eq $out_dir) {
426 warn "You must specify a different input and output directory.\n";
427 exit -1;
430 # Make sure the output directory exists
431 # (It will auto-create it, but for compatibility we should not)
432 -d $out_dir || die "$out_dir: The directory does not exist\n";
434 # Try to open the file, because Locale::PO doesn't check :-/
435 open(INPUT, "<$str_file") || die "$str_file: $!\n";
436 close INPUT;
438 # creates the new tmpl file using the new translation
439 for my $input (@in_files) {
440 die "Assertion failed"
441 unless substr($input, 0, length($in_dir) + 1) eq "$in_dir/";
443 my $target = $out_dir . substr($input, length($in_dir));
444 my $targetdir = $` if $target =~ /[^\/]+$/s;
446 if (!defined $type || $input =~ /\.(?:$type)$/) {
447 my $h = TmplTokenizer->new( $input );
448 $h->set_allow_cformat( 1 );
449 VerboseWarnings::set_input_file_name $input;
450 mkdir_recursive($targetdir) unless -d $targetdir;
451 print STDERR "Creating $target...\n" unless $quiet;
452 open( OUTPUT, ">$target" ) || die "$target: $!\n";
453 text_replace( $h, *OUTPUT );
454 close OUTPUT;
455 } else {
456 # just copying the file
457 mkdir_recursive($targetdir) unless -d $targetdir;
458 system("cp -f $input $target");
459 print STDERR "Copying $input...\n" unless $quiet;
463 } else {
464 usage_error('Unknown action specified.');
467 if ($st == 0) {
468 printf "The %s seems to be successful.\n", $action unless $quiet;
469 } else {
470 printf "%s FAILED.\n", "\u$action" unless $quiet;
472 exit 0;
474 ###############################################################################
476 =head1 SYNOPSIS
478 ./tmpl_process3.pl [ I<tmpl_process.pl options> ]
480 =head1 DESCRIPTION
482 This is an alternative version of the tmpl_process.pl script,
483 using standard gettext-style PO files. While there still might
484 be changes made to the way it extracts strings, at this moment
485 it should be stable enough for general use; it is already being
486 used for the Chinese and Polish translations.
488 Currently, the create, update, and install actions have all been
489 reimplemented and seem to work.
491 =head2 Features
493 =over
495 =item -
497 Translation files in standard Uniforum PO format.
498 All standard tools including all gettext tools,
499 plus PO file editors like kbabel(1) etc.
500 can be used.
502 =item -
504 Minor changes in whitespace in source templates
505 do not generally require strings to be re-translated.
507 =item -
509 Able to handle <TMPL_VAR> variables in the templates;
510 <TMPL_VAR> variables are usually extracted in proper context,
511 represented by a short %s placeholder.
513 =item -
515 Able to handle text input and radio button INPUT elements
516 in the templates; these INPUT elements are also usually
517 extracted in proper context,
518 represented by a short %S or %p placeholder.
520 =item -
522 Automatic comments in the generated PO files to provide
523 even more context (line numbers, and the names and types
524 of the variables).
526 =item -
528 The %I<n>$s (or %I<n>$p, etc.) notation can be used
529 for change the ordering of the variables,
530 if such a reordering is required for correct translation.
532 =item -
534 If a particular <TMPL_VAR> should not appear in the
535 translation, it can be suppressed with the %0.0s notation.
537 =item -
539 Using the PO format also means translators can add their
540 own comments in the translation files, if necessary.
542 =item -
544 Create, update, and install actions are all based on the
545 same scanner module. This ensures that update and install
546 have the same idea of what is a translatable string;
547 attribute names in tags, for example, will not be
548 accidentally translated.
550 =back
552 =head1 NOTES
554 Anchors are represented by an <AI<n>> notation.
555 The meaning of this non-standard notation might not be obvious.
557 The create action calls xgettext.pl to do the actual work;
558 the update action calls xgettext.pl, msgmerge(1) and msgattrib(1)
559 to do the actual work.
561 =head1 BUGS
563 xgettext.pl must be present in the current directory; both
564 msgmerge(1) and msgattrib(1) must also be present in the search path.
565 The script currently does not check carefully whether these
566 dependent commands are present.
568 Locale::PO(3) has a lot of bugs. It can neither parse nor
569 generate GNU PO files properly; a couple of workarounds have
570 been written in TmplTokenizer and more is likely to be needed
571 (e.g., to get rid of the "Strange line" warning for #~).
573 This script may not work in Windows.
575 There are probably some other bugs too, since this has not been
576 tested very much.
578 =head1 SEE ALSO
580 xgettext.pl,
581 TmplTokenizer.pm,
582 msgmerge(1),
583 Locale::PO(3),
584 translator_doc.txt
586 http://www.saas.nsw.edu.au/koha_wiki/index.php?page=DifficultTerms
588 =cut