Bug 7607: (follow-up) Address OPAC and limits
[koha.git] / misc / translator / tmpl_process3.pl
blob3c046189150bd4e6fe6e25f026ce444d7aead45f
1 #!/usr/bin/perl
2 # This file is part of Koha
3 # Parts copyright 2003-2004 Paul Poulain
4 # Parts copyright 2003-2004 Jerome Vizcaino
5 # Parts copyright 2004 Ambrose Li
7 use FindBin;
8 use lib $FindBin::Bin;
10 =head1 NAME
12 tmpl_process3.pl - Alternative version of tmpl_process.pl
13 using gettext-compatible translation files
15 =cut
17 use strict;
18 #use warnings; FIXME - Bug 2505
19 use File::Basename;
20 use Getopt::Long;
21 use Locale::PO;
22 use File::Temp qw( :POSIX );
23 use TmplTokenizer;
24 use VerboseWarnings qw( :warn :die );
26 ###############################################################################
28 use vars qw( @in_dirs @filenames @match @nomatch $str_file $out_dir $quiet );
29 use vars qw( @excludes $exclude_regex );
30 use vars qw( $recursive_p );
31 use vars qw( $pedantic_p );
32 use vars qw( $href );
33 use vars qw( $type ); # file extension (DOS form without the dot) to match
34 use vars qw( $charset_in $charset_out );
36 ###############################################################################
38 sub find_translation {
39 my($s) = @_;
40 my $key = $s;
41 if ($s =~ /\S/s) {
42 $key = TmplTokenizer::string_canon($key);
43 $key = TmplTokenizer::charset_convert($key, $charset_in, $charset_out);
44 $key = TmplTokenizer::quote_po($key);
46 if (defined $href->{$key} && !$href->{$key}->fuzzy && length Locale::PO->dequote($href->{$key}->msgstr)){
47 if ($s =~ /^(\s+)/){
48 return $1 . Locale::PO->dequote($href->{$key}->msgstr);
50 else {
51 return Locale::PO->dequote($href->{$key}->msgstr);
54 else {
55 return $s;
59 sub text_replace_tag {
60 my($t, $attr) = @_;
61 my $it;
62 my @ttvar;
64 # value [tag=input], meta
65 my $tag = ($t =~ /^<(\S+)/s) ? lc($1) : undef;
66 my $translated_p = 0;
67 for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder', 'aria-label') {
68 if ($attr->{$a}) {
69 next if $a eq 'label' && $tag ne 'optgroup';
70 next if $a eq 'content' && $tag ne 'meta';
71 next if $a eq 'value' && ($tag ne 'input' || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:checkbox|hidden|radio)$/)); # FIXME
73 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
74 if ($val =~ /\S/s) {
75 # for selected attributes replace '[%..%]' with '%s' and remember matches
76 if ( $a =~ /title|value|alt|content|placeholder|aria-label/ ) {
77 while ( $val =~ s/(\[\%.*?\%\])/\%s/ ) {
78 my $var = $1;
79 push @ttvar, $1;
82 # find translation for transformed attributes
83 my $s = find_translation($val);
84 # replace '%s' with original content (in order) on translated string, this is fragile!
85 if ( $a =~ /title|value|alt|content|placeholder|aria-label/ and @ttvar ) {
86 while ( @ttvar ) {
87 my $var = shift @ttvar;
88 $s =~ s/\%s/$var/;
91 if ($attr->{$a}->[1] ne $s) { #FIXME
92 $attr->{$a}->[1] = $s; # FIXME
93 $attr->{$a}->[2] = ($s =~ /"/s)? "'$s'": "\"$s\""; #FIXME
94 $translated_p = 1;
99 if ($translated_p) {
100 $it = "<$tag"
101 . join('', map { if ($_ ne '/'){
102 sprintf(' %s="%s"', $_, $attr->{$_}->[1]);
104 else {
105 sprintf(' %s',$_);
108 } sort {
109 $attr->{$a}->[3] <=> $attr->{$b}->[3] #FIXME
110 || $a cmp $b # Sort attributes BZ 22236
111 } keys %$attr);
112 $it .= '>';
114 else {
115 $it = $t;
117 return $it;
120 sub text_replace {
121 my($h, $output) = @_;
122 for (;;) {
123 my $s = TmplTokenizer::next_token($h);
124 last unless defined $s;
125 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
126 if ($kind eq C4::TmplTokenType::TEXT) {
127 print $output find_translation($t);
128 } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) {
129 my $fmt = find_translation($s->form);
130 print $output TmplTokenizer::parametrize($fmt, 1, $s, sub {
131 $_ = $_[0];
132 my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes);
133 $kind == C4::TmplTokenType::TAG && %$attr?
134 text_replace_tag($t, $attr): $t });
135 } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) {
136 print $output text_replace_tag($t, $attr);
137 } elsif ($s->has_js_data) {
138 for my $t (@{$s->js_data}) {
139 # FIXME for this whole block
140 if ($t->[0]) {
141 printf $output "%s%s%s", $t->[2], find_translation($t->[3]),
142 $t->[2];
143 } else {
144 print $output $t->[1];
147 } elsif (defined $t) {
148 # Quick fix to bug 4472
149 $t = "<!DOCTYPE stylesheet [" if $t =~ /DOCTYPE stylesheet/ ;
150 print $output $t;
155 sub listfiles {
156 my($dir, $type, $action) = @_;
157 my $filenames = join ('|', @filenames); # used to update strings from this file
158 my $match = join ('|', @match); # use only this files
159 my $nomatch = join ('|', @nomatch); # do no use this files
160 my @it = ();
161 if (opendir(DIR, $dir)) {
162 my @dirent = readdir DIR; # because DIR is shared when recursing
163 closedir DIR;
164 for my $dirent (@dirent) {
165 my $path = "$dir/$dirent";
166 if ($dirent =~ /^\./ || $dirent eq 'CVS' || $dirent eq 'RCS'
167 || (defined $exclude_regex && $dirent =~ /^(?:$exclude_regex)$/)) {
169 } elsif (-f $path) {
170 my $basename = fileparse( $path );
171 push @it, $path
172 if ( not @filenames or $basename =~ /($filenames)/i )
173 and ( not @match or $basename =~ /($match)/i ) # files to include
174 and ( not @nomatch or $basename !~ /($nomatch)/i ) # files not to include
175 and (!defined $type || $dirent =~ /\.(?:$type)$/) || $action eq 'install';
176 } elsif (-d $path && $recursive_p) {
177 push @it, listfiles($path, $type, $action);
180 } else {
181 warn_normal("$dir: $!", undef);
183 return @it;
186 ###############################################################################
188 sub mkdir_recursive {
189 my($dir) = @_;
190 local($`, $&, $', $1);
191 $dir = $` if $dir ne /^\/+$/ && $dir =~ /\/+$/;
192 my ($prefix, $basename) = ($dir =~ /\/([^\/]+)$/s)? ($`, $1): ('.', $dir);
193 mkdir_recursive($prefix) if $prefix ne '.' && !-d $prefix;
194 if (!-d $dir) {
195 print STDERR "Making directory $dir...\n" unless $quiet;
196 # creates with rwxrwxr-x permissions
197 mkdir($dir, 0775) || warn_normal("$dir: $!", undef);
201 ###############################################################################
203 sub usage {
204 my($exitcode) = @_;
205 my $h = $exitcode? *STDERR: *STDOUT;
206 print $h <<EOF;
207 Usage: $0 install [OPTION]
208 or: $0 --help
209 Install translated templates.
211 -i, --input=SOURCE Get or update strings from SOURCE directory(s).
212 On create or update can have multiple values.
213 On install only one value.
214 -o, --outputdir=DIRECTORY Install translation(s) to specified DIRECTORY
215 --pedantic-warnings Issue warnings even for detected problems
216 which are likely to be harmless
217 -r, --recursive SOURCE in the -i option is a directory
218 -f, --filename=FILE FILE is a specific filename or part of it.
219 If given, only these files will be processed.
220 On update only relevant strings will be updated.
221 -m, --match=FILE FILE is a specific filename or part of it.
222 If given, only these files will be processed.
223 -n, --nomatch=FILE FILE is a specific filename or part of it.
224 If given, these files will not be processed.
225 -s, --str-file=FILE Specify FILE as the translation (po) file
226 for input (install) or output (create, update)
227 -x, --exclude=REGEXP Exclude dirs matching the given REGEXP
228 --help Display this help and exit
229 -q, --quiet no output to screen (except for errors)
231 Try `perldoc $0` for perhaps more information.
233 exit($exitcode);
236 ###############################################################################
238 sub usage_error {
239 for my $msg (split(/\n/, $_[0])) {
240 print STDERR "$msg\n";
242 print STDERR "Try `$0 --help for more information.\n";
243 exit(-1);
246 ###############################################################################
248 GetOptions(
249 'input|i=s' => \@in_dirs,
250 'filename|f=s' => \@filenames,
251 'match|m=s' => \@match,
252 'nomatch|n=s' => \@nomatch,
253 'outputdir|o=s' => \$out_dir,
254 'recursive|r' => \$recursive_p,
255 'str-file|s=s' => \$str_file,
256 'exclude|x=s' => \@excludes,
257 'quiet|q' => \$quiet,
258 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
259 'help' => \&usage,
260 ) || usage_error();
262 VerboseWarnings::set_application_name($0);
263 VerboseWarnings::set_pedantic_mode($pedantic_p);
265 my $action = shift or usage_error('You must specify an ACTION.');
266 usage_error('You must at least specify input and string list filenames.')
267 if !@in_dirs || !defined $str_file;
269 # Type match defaults to *.tt plus *.inc if not specified
270 $type = "tt|inc|xsl|xml|def" if !defined($type);
272 # Check the inputs for being directories
273 for my $in_dir ( @in_dirs ) {
274 usage_error("$in_dir: Input must be a directory.\n"
275 . "(Symbolic links are not supported at the moment)")
276 unless -d $in_dir;
279 # Generates the global exclude regular expression
280 $exclude_regex = '(?:'.join('|', @excludes).')' if @excludes;
282 my @in_files;
283 # Generate the list of input files if a directory is specified
284 # input is a directory, generates list of files to process
286 for my $fn ( @filenames ) {
287 die "You cannot specify input files and directories at the same time.\n"
288 if -d $fn;
290 for my $in_dir ( @in_dirs ) {
291 $in_dir =~ s/\/$//; # strips the trailing / if any
292 @in_files = ( @in_files, listfiles($in_dir, $type, $action));
295 # restores the string list from file
296 $href = Locale::PO->load_file_ashash($str_file, 'utf-8');
298 # guess the charsets. HTML::Templates defaults to iso-8859-1
299 if (defined $href) {
300 die "$str_file: PO file is corrupted, or not a PO file\n" unless defined $href->{'""'};
301 $charset_out = TmplTokenizer::charset_canon($2) if $href->{'""'}->msgstr =~ /\bcharset=(["']?)([^;\s"'\\]+)\1/;
302 $charset_in = $charset_out;
303 # for my $msgid (keys %$href) {
304 # if ($msgid =~ /\bcharset=(["']?)([^;\s"'\\]+)\1/) {
305 # my $candidate = TmplTokenizer::charset_canon $2;
306 # die "Conflicting charsets in msgid: $charset_in vs $candidate => $msgid\n"
307 # if defined $charset_in && $charset_in ne $candidate;
308 # $charset_in = $candidate;
312 # BUG6464: check consistency of PO messages
313 # - count number of '%s' in msgid and msgstr
314 for my $msg ( values %$href ) {
315 my $id_count = split(/%s/, $msg->{msgid}) - 1;
316 my $str_count = split(/%s/, $msg->{msgstr}) - 1;
317 next if $id_count == $str_count ||
318 $msg->{msgstr} eq '""' ||
319 grep { /fuzzy/ } @{$msg->{_flags}};
320 warn_normal(
321 "unconsistent %s count: ($id_count/$str_count):\n" .
322 " line: " . $msg->{loaded_line_number} . "\n" .
323 " msgid: " . $msg->{msgid} . "\n" .
324 " msgstr: " . $msg->{msgstr} . "\n", undef);
328 # set our charset in to UTF-8
329 if (!defined $charset_in) {
330 $charset_in = TmplTokenizer::charset_canon('UTF-8');
331 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n" unless ( $quiet );
333 # set our charset out to UTF-8
334 if (!defined $charset_out) {
335 $charset_out = TmplTokenizer::charset_canon('UTF-8');
336 warn "Warning: Charset Out defaulting to $charset_out\n" unless ( $quiet );
338 my $st;
340 if ($action eq 'install') {
341 if(!defined($out_dir)) {
342 usage_error("You must specify an output directory when using the install method.");
345 if ( scalar @in_dirs > 1 ) {
346 usage_error("You must specify only one input directory when using the install method.");
349 my $in_dir = shift @in_dirs;
351 if ($in_dir eq $out_dir) {
352 warn "You must specify a different input and output directory.\n";
353 exit -1;
356 # Make sure the output directory exists
357 # (It will auto-create it, but for compatibility we should not)
358 -d $out_dir || die "$out_dir: The directory does not exist\n";
360 # Try to open the file, because Locale::PO doesn't check :-/
361 open(my $fh, '<', $str_file) || die "$str_file: $!\n";
362 close $fh;
364 # creates the new tmpl file using the new translation
365 for my $input (@in_files) {
366 die "Assertion failed"
367 unless substr($input, 0, length($in_dir) + 1) eq "$in_dir/";
369 my $target = $out_dir . substr($input, length($in_dir));
370 my $targetdir = ($target =~ /[^\/]+$/s) ? $` : undef;
372 if (!defined $type || $input =~ /\.(?:$type)$/) {
373 my $h = TmplTokenizer->new( $input );
374 $h->set_allow_cformat( 1 );
375 VerboseWarnings::set_input_file_name($input);
376 mkdir_recursive($targetdir) unless -d $targetdir;
377 print STDERR "Creating $target...\n" unless $quiet;
378 open( my $fh, ">:encoding(UTF-8)", "$target" ) || die "$target: $!\n";
379 text_replace( $h, $fh );
380 close $fh;
381 } else {
382 # just copying the file
383 mkdir_recursive($targetdir) unless -d $targetdir;
384 system("cp -f $input $target");
385 print STDERR "Copying $input...\n" unless $quiet;
389 } else {
390 usage_error('Unknown action specified.');
393 if ($st == 0) {
394 printf "The %s seems to be successful.\n", $action unless $quiet;
395 } else {
396 printf "%s FAILED.\n", "\u$action" unless $quiet;
398 exit 0;
400 ###############################################################################
402 =head1 SYNOPSIS
404 ./tmpl_process3.pl [ I<tmpl_process.pl options> ]
406 =head1 DESCRIPTION
408 This is an alternative version of the tmpl_process.pl script,
409 using standard gettext-style PO files. While there still might
410 be changes made to the way it extracts strings, at this moment
411 it should be stable enough for general use; it is already being
412 used for the Chinese and Polish translations.
414 Currently, the create, update, and install actions have all been
415 reimplemented and seem to work.
417 =head2 Features
419 =over
421 =item -
423 Translation files in standard Uniforum PO format.
424 All standard tools including all gettext tools,
425 plus PO file editors like kbabel(1) etc.
426 can be used.
428 =item -
430 Minor changes in whitespace in source templates
431 do not generally require strings to be re-translated.
433 =item -
435 Able to handle <TMPL_VAR> variables in the templates;
436 <TMPL_VAR> variables are usually extracted in proper context,
437 represented by a short %s placeholder.
439 =item -
441 Able to handle text input and radio button INPUT elements
442 in the templates; these INPUT elements are also usually
443 extracted in proper context,
444 represented by a short %S or %p placeholder.
446 =item -
448 Automatic comments in the generated PO files to provide
449 even more context (line numbers, and the names and types
450 of the variables).
452 =item -
454 The %I<n>$s (or %I<n>$p, etc.) notation can be used
455 for change the ordering of the variables,
456 if such a reordering is required for correct translation.
458 =item -
460 If a particular <TMPL_VAR> should not appear in the
461 translation, it can be suppressed with the %0.0s notation.
463 =item -
465 Using the PO format also means translators can add their
466 own comments in the translation files, if necessary.
468 =back
470 =head1 NOTES
472 Anchors are represented by an <AI<n>> notation.
473 The meaning of this non-standard notation might not be obvious.
475 =head1 BUGS
477 This script may not work in Windows.
479 There are probably some other bugs too, since this has not been
480 tested very much.
482 =head1 SEE ALSO
484 TmplTokenizer.pm,
485 Locale::PO(3),
487 =cut