Bug 9098 Replace tabulations by spaces in opac-user.pl
[koha.git] / misc / translator / tmpl_process3.pl
blob89c993f672ad230de0aebbdb692e82b62fd772fb
1 #!/usr/bin/perl
2 # This file is part of Koha
3 # Parts copyright 2003-2004 Paul Poulain
4 # Parts copyright 2003-2004 Jerome Vizcaino
5 # Parts copyright 2004 Ambrose Li
7 =head1 NAME
9 tmpl_process3.pl - Alternative version of tmpl_process.pl
10 using gettext-compatible translation files
12 =cut
14 use strict;
15 #use warnings; FIXME - Bug 2505
16 use Getopt::Long;
17 use Locale::PO;
18 use File::Temp qw( :POSIX );
19 use TmplTokenizer;
20 use VerboseWarnings qw( :warn :die );
22 ###############################################################################
24 use vars qw( @in_files $in_dir $str_file $out_dir $quiet );
25 use vars qw( @excludes $exclude_regex );
26 use vars qw( $recursive_p );
27 use vars qw( $pedantic_p );
28 use vars qw( $href );
29 use vars qw( $type ); # file extension (DOS form without the dot) to match
30 use vars qw( $charset_in $charset_out );
32 ###############################################################################
34 sub find_translation ($) {
35 my($s) = @_;
36 my $key = $s;
37 if ($s =~ /\S/s) {
38 $key = TmplTokenizer::string_canon($key);
39 $key = TmplTokenizer::charset_convert($key, $charset_in, $charset_out);
40 $key = TmplTokenizer::quote_po($key);
42 if (defined $href->{$key} && !$href->{$key}->fuzzy && length Locale::PO->dequote($href->{$key}->msgstr)){
43 if ($s =~ /^(\s+)/){
44 return $1 . Locale::PO->dequote($href->{$key}->msgstr);
46 else {
47 return Locale::PO->dequote($href->{$key}->msgstr);
50 else {
51 return $s;
55 sub text_replace_tag ($$) {
56 my($t, $attr) = @_;
57 my $it;
59 # value [tag=input], meta
60 my $tag = lc($1) if $t =~ /^<(\S+)/s;
61 my $translated_p = 0;
62 for my $a ('alt', 'content', 'title', 'value','label') {
63 if ($attr->{$a}) {
64 next if $a eq 'label' && $tag ne 'optgroup';
65 next if $a eq 'content' && $tag ne 'meta';
66 next if $a eq 'value' && ($tag ne 'input' || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:checkbox|hidden|radio|text)$/)); # FIXME
68 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
69 if ($val =~ /\S/s) {
70 my $s = find_translation($val);
71 if ($attr->{$a}->[1] ne $s) { #FIXME
72 $attr->{$a}->[1] = $s; # FIXME
73 $attr->{$a}->[2] = ($s =~ /"/s)? "'$s'": "\"$s\""; #FIXME
74 $translated_p = 1;
79 if ($translated_p) {
80 $it = "<$tag"
81 . join('', map { if ($_ ne '/'){
82 sprintf(' %s="%s"', $_, $attr->{$_}->[1]);
84 else {
85 sprintf(' %s',$_);
88 } sort {
89 $attr->{$a}->[3] <=> $attr->{$b}->[3] #FIXME
90 } keys %$attr);
91 $it .= '>';
93 else {
94 $it = $t;
96 return $it;
99 sub text_replace (**) {
100 my($h, $output) = @_;
101 for (;;) {
102 my $s = TmplTokenizer::next_token $h;
103 last unless defined $s;
104 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
105 if ($kind eq C4::TmplTokenType::TEXT) {
106 print $output find_translation($t);
107 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
108 my $fmt = find_translation($s->form);
109 print $output TmplTokenizer::parametrize($fmt, 1, $s, sub {
110 $_ = $_[0];
111 my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes);
112 $kind == C4::TmplTokenType::TAG && %$attr?
113 text_replace_tag($t, $attr): $t });
114 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
115 print $output text_replace_tag($t, $attr);
116 } elsif ($s->has_js_data) {
117 for my $t (@{$s->js_data}) {
118 # FIXME for this whole block
119 if ($t->[0]) {
120 printf $output "%s%s%s", $t->[2], find_translation $t->[3],
121 $t->[2];
122 } else {
123 print $output $t->[1];
126 } elsif (defined $t) {
127 # Quick fix to bug 4472
128 $t = "<!DOCTYPE stylesheet [" if $t =~ /DOCTYPE stylesheet/ ;
129 print $output $t;
134 sub listfiles ($$$) {
135 my($dir, $type, $action) = @_;
136 my @it = ();
137 if (opendir(DIR, $dir)) {
138 my @dirent = readdir DIR; # because DIR is shared when recursing
139 closedir DIR;
140 for my $dirent (@dirent) {
141 my $path = "$dir/$dirent";
142 if ($dirent =~ /^\./ || $dirent eq 'CVS' || $dirent eq 'RCS'
143 || (defined $exclude_regex && $dirent =~ /^(?:$exclude_regex)$/)) {
145 } elsif (-f $path) {
146 push @it, $path if (!defined $type || $dirent =~ /\.(?:$type)$/) || $action eq 'install';
147 } elsif (-d $path && $recursive_p) {
148 push @it, listfiles($path, $type, $action);
151 } else {
152 warn_normal "$dir: $!", undef;
154 return @it;
157 ###############################################################################
159 sub mkdir_recursive ($) {
160 my($dir) = @_;
161 local($`, $&, $', $1);
162 $dir = $` if $dir ne /^\/+$/ && $dir =~ /\/+$/;
163 my ($prefix, $basename) = ($dir =~ /\/([^\/]+)$/s)? ($`, $1): ('.', $dir);
164 mkdir_recursive($prefix) if $prefix ne '.' && !-d $prefix;
165 if (!-d $dir) {
166 print STDERR "Making directory $dir..." unless $quiet;
167 # creates with rwxrwxr-x permissions
168 mkdir($dir, 0775) || warn_normal "$dir: $!", undef;
172 ###############################################################################
174 sub usage ($) {
175 my($exitcode) = @_;
176 my $h = $exitcode? *STDERR: *STDOUT;
177 print $h <<EOF;
178 Usage: $0 create [OPTION]
179 or: $0 update [OPTION]
180 or: $0 install [OPTION]
181 or: $0 --help
182 Create or update PO files from templates, or install translated templates.
184 -i, --input=SOURCE Get or update strings from SOURCE file.
185 SOURCE is a directory if -r is also specified.
186 -o, --outputdir=DIRECTORY Install translation(s) to specified DIRECTORY
187 --pedantic-warnings Issue warnings even for detected problems
188 which are likely to be harmless
189 -r, --recursive SOURCE in the -i option is a directory
190 -s, --str-file=FILE Specify FILE as the translation (po) file
191 for input (install) or output (create, update)
192 -x, --exclude=REGEXP Exclude files matching the given REGEXP
193 --help Display this help and exit
194 -q, --quiet no output to screen (except for errors)
196 The -o option is ignored for the "create" and "update" actions.
197 Try `perldoc $0 for perhaps more information.
199 exit($exitcode);
202 ###############################################################################
204 sub usage_error (;$) {
205 for my $msg (split(/\n/, $_[0])) {
206 print STDERR "$msg\n";
208 print STDERR "Try `$0 --help for more information.\n";
209 exit(-1);
212 ###############################################################################
214 GetOptions(
215 'input|i=s' => \@in_files,
216 'outputdir|o=s' => \$out_dir,
217 'recursive|r' => \$recursive_p,
218 'str-file|s=s' => \$str_file,
219 'exclude|x=s' => \@excludes,
220 'quiet|q' => \$quiet,
221 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
222 'help' => \&usage,
223 ) || usage_error;
225 VerboseWarnings::set_application_name $0;
226 VerboseWarnings::set_pedantic_mode $pedantic_p;
228 # keep the buggy Locale::PO quiet if it says stupid things
229 $SIG{__WARN__} = sub {
230 my($s) = @_;
231 print STDERR $s unless $s =~ /^Strange line in [^:]+: #~/s
234 my $action = shift or usage_error('You must specify an ACTION.');
235 usage_error('You must at least specify input and string list filenames.')
236 if !@in_files || !defined $str_file;
238 # Type match defaults to *.tt plus *.inc if not specified
239 $type = "tt|inc|xsl" if !defined($type);
241 # Check the inputs for being files or directories
242 for my $input (@in_files) {
243 usage_error("$input: Input must be a file or directory.\n"
244 . "(Symbolic links are not supported at the moment)")
245 unless -d $input || -f $input;;
248 # Generates the global exclude regular expression
249 $exclude_regex = '(?:'.join('|', @excludes).')' if @excludes;
251 # Generate the list of input files if a directory is specified
252 if (-d $in_files[0]) {
253 die "If you specify a directory as input, you must specify only it.\n"
254 if @in_files > 1;
256 # input is a directory, generates list of files to process
257 $in_dir = $in_files[0];
258 $in_dir =~ s/\/$//; # strips the trailing / if any
259 @in_files = listfiles($in_dir, $type, $action);
260 } else {
261 for my $input (@in_files) {
262 die "You cannot specify input files and directories at the same time.\n"
263 unless -f $input;
267 # restores the string list from file
268 $href = Locale::PO->load_file_ashash($str_file);
270 # guess the charsets. HTML::Templates defaults to iso-8859-1
271 if (defined $href) {
272 die "$str_file: PO file is corrupted, or not a PO file\n" unless defined $href->{'""'};
273 $charset_out = TmplTokenizer::charset_canon $2 if $href->{'""'}->msgstr =~ /\bcharset=(["']?)([^;\s"'\\]+)\1/;
274 $charset_in = $charset_out;
275 # for my $msgid (keys %$href) {
276 # if ($msgid =~ /\bcharset=(["']?)([^;\s"'\\]+)\1/) {
277 # my $candidate = TmplTokenizer::charset_canon $2;
278 # die "Conflicting charsets in msgid: $charset_in vs $candidate => $msgid\n"
279 # if defined $charset_in && $charset_in ne $candidate;
280 # $charset_in = $candidate;
284 # BUG6464: check consistency of PO messages
285 # - count number of '%s' in msgid and msgstr
286 for my $msg ( values %$href ) {
287 my $id_count = split(/%s/, $msg->{msgid}) - 1;
288 my $str_count = split(/%s/, $msg->{msgstr}) - 1;
289 next if $id_count == $str_count ||
290 $msg->{msgstr} eq '""' ||
291 grep { /fuzzy/ } @{$msg->{_flags}};
292 warn_normal
293 "unconsistent %s count: ($id_count/$str_count):\n" .
294 " line: " . $msg->{loaded_line_number} . "\n" .
295 " msgid: " . $msg->{msgid} . "\n" .
296 " msgstr: " . $msg->{msgstr} . "\n", undef;
300 # set our charset in to UTF-8
301 if (!defined $charset_in) {
302 $charset_in = TmplTokenizer::charset_canon 'UTF-8';
303 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
305 # set our charset out to UTF-8
306 if (!defined $charset_out) {
307 $charset_out = TmplTokenizer::charset_canon 'UTF-8';
308 warn "Warning: Charset Out defaulting to $charset_out\n";
310 my $xgettext = './xgettext.pl'; # actual text extractor script
311 my $st;
313 if ($action eq 'create') {
314 # updates the list. As the list is empty, every entry will be added
315 if (!-s $str_file) {
316 warn "Removing empty file $str_file\n";
317 unlink $str_file || die "$str_file: $!\n";
319 die "$str_file: Output file already exists\n" if -f $str_file;
320 my($tmph1, $tmpfile1) = tmpnam();
321 my($tmph2, $tmpfile2) = tmpnam();
322 close $tmph2; # We just want a name
323 # Generate the temporary file that acts as <MODULE>/POTFILES.in
324 for my $input (@in_files) {
325 print $tmph1 "$input\n";
327 close $tmph1;
328 warn "I $charset_in O $charset_out";
329 # Generate the specified po file ($str_file)
330 $st = system ($xgettext, '-s', '-f', $tmpfile1, '-o', $tmpfile2,
331 (defined $charset_in? ('-I', $charset_in): ()),
332 (defined $charset_out? ('-O', $charset_out): ())
334 # Run msgmerge so that the pot file looks like a real pot file
335 # We need to help msgmerge a bit by pre-creating a dummy po file that has
336 # the headers and the "" msgid & msgstr. It will fill in the rest.
337 if ($st == 0) {
338 # Merge the temporary "pot file" with the specified po file ($str_file)
339 # FIXME: msgmerge(1) is a Unix dependency
340 # FIXME: need to check the return value
341 unless (-f $str_file) {
342 local(*INPUT, *OUTPUT);
343 open(INPUT, "<$tmpfile2");
344 open(OUTPUT, ">$str_file");
345 while (<INPUT>) {
346 print OUTPUT;
347 last if /^\n/s;
349 close INPUT;
350 close OUTPUT;
352 $st = system("msgmerge -U ".($quiet?'-q':'')." -s $str_file $tmpfile2");
353 } else {
354 error_normal "Text extraction failed: $xgettext: $!\n", undef;
355 error_additional "Will not run msgmerge\n", undef;
357 # unlink $tmpfile1 || warn_normal "$tmpfile1: unlink failed: $!\n", undef;
358 # unlink $tmpfile2 || warn_normal "$tmpfile2: unlink failed: $!\n", undef;
360 } elsif ($action eq 'update') {
361 my($tmph1, $tmpfile1) = tmpnam();
362 my($tmph2, $tmpfile2) = tmpnam();
363 close $tmph2; # We just want a name
364 # Generate the temporary file that acts as <MODULE>/POTFILES.in
365 for my $input (@in_files) {
366 print $tmph1 "$input\n";
368 close $tmph1;
369 # Generate the temporary file that acts as <MODULE>/<LANG>.pot
370 $st = system($xgettext, '-s', '-f', $tmpfile1, '-o', $tmpfile2,
371 '--po-mode',
372 (defined $charset_in? ('-I', $charset_in): ()),
373 (defined $charset_out? ('-O', $charset_out): ()));
374 if ($st == 0) {
375 # Merge the temporary "pot file" with the specified po file ($str_file)
376 # FIXME: msgmerge(1) is a Unix dependency
377 # FIXME: need to check the return value
378 $st = system("msgmerge -U ".($quiet?'-q':'')." -s $str_file $tmpfile2");
379 } else {
380 error_normal "Text extraction failed: $xgettext: $!\n", undef;
381 error_additional "Will not run msgmerge\n", undef;
383 # unlink $tmpfile1 || warn_normal "$tmpfile1: unlink failed: $!\n", undef;
384 # unlink $tmpfile2 || warn_normal "$tmpfile2: unlink failed: $!\n", undef;
386 } elsif ($action eq 'install') {
387 if(!defined($out_dir)) {
388 usage_error("You must specify an output directory when using the install method.");
391 if ($in_dir eq $out_dir) {
392 warn "You must specify a different input and output directory.\n";
393 exit -1;
396 # Make sure the output directory exists
397 # (It will auto-create it, but for compatibility we should not)
398 -d $out_dir || die "$out_dir: The directory does not exist\n";
400 # Try to open the file, because Locale::PO doesn't check :-/
401 open(INPUT, "<$str_file") || die "$str_file: $!\n";
402 close INPUT;
404 # creates the new tmpl file using the new translation
405 for my $input (@in_files) {
406 die "Assertion failed"
407 unless substr($input, 0, length($in_dir) + 1) eq "$in_dir/";
408 # print "$input / $type\n";
409 if (!defined $type || $input =~ /\.(?:$type)$/) {
410 my $h = TmplTokenizer->new( $input );
411 $h->set_allow_cformat( 1 );
412 VerboseWarnings::set_input_file_name $input;
414 my $target = $out_dir . substr($input, length($in_dir));
415 my $targetdir = $` if $target =~ /[^\/]+$/s;
416 mkdir_recursive($targetdir) unless -d $targetdir;
417 print STDERR "Creating $target...\n" unless $quiet;
418 open( OUTPUT, ">$target" ) || die "$target: $!\n";
419 text_replace( $h, *OUTPUT );
420 close OUTPUT;
421 } else {
422 # just copying the file
423 my $target = $out_dir . substr($input, length($in_dir));
424 my $targetdir = $` if $target =~ /[^\/]+$/s;
425 mkdir_recursive($targetdir) unless -d $targetdir;
426 system("cp -f $input $target");
427 print STDERR "Copying $input...\n" unless $quiet;
431 } else {
432 usage_error('Unknown action specified.');
435 if ($st == 0) {
436 printf "The %s seems to be successful.\n", $action unless $quiet;
437 } else {
438 printf "%s FAILED.\n", "\u$action" unless $quiet;
440 exit 0;
442 ###############################################################################
444 =head1 SYNOPSIS
446 ./tmpl_process3.pl [ I<tmpl_process.pl options> ]
448 =head1 DESCRIPTION
450 This is an alternative version of the tmpl_process.pl script,
451 using standard gettext-style PO files. While there still might
452 be changes made to the way it extracts strings, at this moment
453 it should be stable enough for general use; it is already being
454 used for the Chinese and Polish translations.
456 Currently, the create, update, and install actions have all been
457 reimplemented and seem to work.
459 =head2 Features
461 =over
463 =item -
465 Translation files in standard Uniforum PO format.
466 All standard tools including all gettext tools,
467 plus PO file editors like kbabel(1) etc.
468 can be used.
470 =item -
472 Minor changes in whitespace in source templates
473 do not generally require strings to be re-translated.
475 =item -
477 Able to handle <TMPL_VAR> variables in the templates;
478 <TMPL_VAR> variables are usually extracted in proper context,
479 represented by a short %s placeholder.
481 =item -
483 Able to handle text input and radio button INPUT elements
484 in the templates; these INPUT elements are also usually
485 extracted in proper context,
486 represented by a short %S or %p placeholder.
488 =item -
490 Automatic comments in the generated PO files to provide
491 even more context (line numbers, and the names and types
492 of the variables).
494 =item -
496 The %I<n>$s (or %I<n>$p, etc.) notation can be used
497 for change the ordering of the variables,
498 if such a reordering is required for correct translation.
500 =item -
502 If a particular <TMPL_VAR> should not appear in the
503 translation, it can be suppressed with the %0.0s notation.
505 =item -
507 Using the PO format also means translators can add their
508 own comments in the translation files, if necessary.
510 =item -
512 Create, update, and install actions are all based on the
513 same scanner module. This ensures that update and install
514 have the same idea of what is a translatable string;
515 attribute names in tags, for example, will not be
516 accidentally translated.
518 =back
520 =head1 NOTES
522 Anchors are represented by an <AI<n>> notation.
523 The meaning of this non-standard notation might not be obvious.
525 The create action calls xgettext.pl to do the actual work;
526 the update action calls xgettext.pl and msgmerge(1) to do the
527 actual work.
529 =head1 BUGS
531 xgettext.pl must be present in the current directory; the
532 msgmerge(1) command must also be present in the search path.
533 The script currently does not check carefully whether these
534 dependent commands are present.
536 Locale::PO(3) has a lot of bugs. It can neither parse nor
537 generate GNU PO files properly; a couple of workarounds have
538 been written in TmplTokenizer and more is likely to be needed
539 (e.g., to get rid of the "Strange line" warning for #~).
541 This script may not work in Windows.
543 There are probably some other bugs too, since this has not been
544 tested very much.
546 =head1 SEE ALSO
548 xgettext.pl,
549 TmplTokenizer.pm,
550 msgmerge(1),
551 Locale::PO(3),
552 translator_doc.txt
554 http://www.saas.nsw.edu.au/koha_wiki/index.php?page=DifficultTerms
556 =cut