3 # Test filter partially based on Ambrose's hideous subst.pl code
4 # The idea is that the .tt files are not valid HTML, and as a result
5 # HTML::Parse would be completely confused by these templates.
6 # This is just a simple scanner (not a parser) & should give better results.
8 # This script is meant to be a drop-in replacement of text-extract.pl
10 # A grander plan: Code could be written to detect template variables and
11 # construct gettext-c-format-string-like meta-strings (e.g., "Results %s
12 # through %s of %s records" that will be more likely to be translatable
13 # to languages where word order is very unlike English word order.
14 # --> This will be relatively major rework, and requires corresponding
15 # rework in tmpl_process.pl
18 #use warnings; FIXME - Bug 2505
23 use vars
qw( $input );
24 use vars qw( $debug_dump_only_p );
25 use vars qw( $pedantic_p );
26 use vars qw( $allow_cformat_p ); # FOR TESTING PURPOSES ONLY!!
28 ###############################################################################
30 sub underline ($) { # for testing only
32 join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $s));
35 sub debug_dump ($) { # for testing only
37 print "re_tag_compat is /", TmplTokenizer::re_tag(1), "/\n";
39 my $s = TmplTokenizer::next_token $h;
40 last unless defined $s;
41 printf "%s\n", ('-' x 79);
42 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
43 printf "%s [line %d]:\n", $kind->to_string, $s->line_number;
44 printf "%4dH%s\n", length($t), underline($t);
45 if ($kind == TmplTokenType::TAG() && %$attr) {
46 printf "Attributes:\n";
47 for my $a (keys %$attr) {
48 my($key, $val, $val_orig, $order) = @{$attr->{$a}};
49 printf "%s = %dH%s -- %s\n", $a, length $val, underline $val,
53 if ($kind == TmplTokenType::TEXT_PARAMETRIZED()) {
54 printf "Form (c-format string):\n";
55 printf "%dH%s\n", length $s->form, underline $s->form;
56 printf "Parameters:\n";
58 for my $a ($s->parameters) {
60 printf "%%%d\$s = %dH%s\n", $i, length $t, underline $t;
64 if ($s->has_js_data) {
65 printf "JavaScript translatable strings:\n";
66 for my $t (@{$s->js_data}) {
67 printf "%dH%s\n", length $t->[3], underline $t->[3] if $t->[0]; # FIXME
73 ###############################################################################
75 sub text_extract ($) {
79 my $s = TmplTokenizer::next_token $h;
80 last unless defined $s;
81 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
82 if ($kind == TmplTokenType::TEXT()) {
83 $t = TmplTokenizer::trim $t;
84 $text{$t} = 1 if $t =~ /\S/s;
85 } elsif ($kind == TmplTokenType::TAG() && %$attr) {
86 # value [tag=input], meta
87 my $tag = lc($1) if $t =~ /^<(\S+)/s;
88 for my $a ('alt', 'content', 'title', 'value') {
90 next if $a eq 'content' && $tag ne 'meta';
91 next if $a eq 'value' && ($tag ne 'input'
92 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
93 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
94 $val = TmplTokenizer::trim $val;
95 $text{$val} = 1 if $val =~ /\S/s;
98 } elsif ($s->has_js_data) {
99 for my $t (@{$s->js_data}) {
100 remember( $s, $t->[3] ) if $t->[0]; # FIXME
104 # Emit all extracted strings.
105 # Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
106 for my $t (keys %text) {
108 unless TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
112 ###############################################################################
116 my $h = $exitcode? *STDERR: *STDOUT;
119 Extract strings from HTML file.
121 --debug-dump-only Do not extract strings; but display scanned tokens
122 -f, --file=FILE Extract from the specified FILE
123 --pedantic-warnings Issue warnings even for detected problems which
124 are likely to be harmless
125 --help Display this help and exit
130 ###############################################################################
132 sub usage_error
(;$) {
133 print STDERR
"$_[0]\n" if @_;
134 print STDERR
"Try `$0 --help' for more information.\n";
138 ###############################################################################
141 'enable-cformat' => \
$allow_cformat_p,
142 'f|file=s' => \
$input,
143 'debug-dump-only' => \
$debug_dump_only_p,
144 'pedantic-warnings' => sub { $pedantic_p = 1 },
145 'help' => sub { usage
(0) },
148 VerboseWarnings
::set_application_name
$0;
149 VerboseWarnings
::set_input_file_name
$input;
150 VerboseWarnings
::set_pedantic_mode
$pedantic_p;
152 usage_error
('Missing mandatory option -f') unless defined $input;
154 my $h = TmplTokenizer
->new( $input );
155 $h->set_allow_cformat( 1 ) if $allow_cformat_p;
156 if ($debug_dump_only_p) {
162 warn "This input will not work with Mozilla standards-compliant mode\n", undef
163 if TmplTokenizer
::syntaxerror_p
;
167 exit(-1) if TmplTokenizer
::fatal_p
;