2 #simple parser for HTML with Template Toolkit directives. Tokens are put into @tokens and are accessible via next_token and peep_token
4 use base
qw(HTML::Parser);
9 #seems to be handled post tokenizer
10 ##hash where key is tag we are interested in and the value is a hash of the attributes we want
11 #my %interesting_tags = (
12 # img => { alt => 1 },
15 #tokens found so far (used like a stack)
18 #shift next token or undef
23 #unshift token back on @tokens
# NOTE(review): the enclosing sub header is not visible in this excerpt. Inside a sub the bare
# "shift" takes the next value from @_ -- presumably the token being put back; confirm.
26 unshift @tokens, shift;
29 #have a peep at next token
35 #please use this method INSTEAD of the HTML::Parser->parse_file method (and HTML::Parser->parse):
# it stores the filename on the parser and registers the event handlers before parsing the file.
36 #signature build_tokens( self, filename)
38 my ($self, $filename) = @_;
# keep the filename on the parser object; the handlers read $self->{filename} when creating tokens
39 $self->{filename
} = $filename;
# Each handler() call below names an HTML::Parser event, the method to call for it, and the
# argspec string listing the arguments that method receives, in order.
40 $self->handler(start
=> "start", "self, line, tagname, attr, text"); #signature is start( self, linenumber, tagname, hash of attributes, original text )
41 $self->handler(text
=> "text", "self, line, text, is_cdata"); #signature is text( self, linenumber, original text, is_cdata )
42 $self->handler(end
=> "end", "self, line, tag, attr, text"); #signature is end( self, linenumber, tagname, hash of attributes, original text )
43 $self->handler(declaration
=> "declaration", "self, line, text, is_cdata"); # declaration: signature is declaration( self, linenumber, original text, is_cdata )
44 $self->handler(comment
=> "comment", "self, line, text, is_cdata"); # comments: signature is comment( self, linenumber, original text, is_cdata )
45 # $self->handler(default => "default", "self, line, text, is_cdata"); # anything else
46 $self->marked_sections(1); #treat anything inside CDATA tags as text, should really make it a C4::TmplTokenType::CDATA
47 $self->unbroken_text(1); #report each contiguous block of text as a single text event (can span multiple lines) -- see HTML::Parser unbroken_text
# all tokens are produced by the handlers registered above while this call runs
48 $self->parse_file($filename);
52 #handle parsing of text
# Builds tokens from a piece of text: a Template Toolkit directive found in it becomes a
# DIRECTIVE token, the text before it and any left-over text become TEXT tokens (CDATA tokens
# when $is_cdata is true). Every token is given $line and $self->{filename}.
# NOTE(review): the sub header, the unpacking of the other arguments, the surrounding control
# flow and the code that stores each token are not visible in this excerpt.
56 my $work = shift; # original text
59 # if there is a template_toolkit tag
# NOTE(review): the pattern stops at the first "]" after "[%" (it does not require "%]"), and
# "." does not match a newline because there is no /s flag. A directive that contains "]" is
# therefore cut short, and one spanning several lines is not recognised -- confirm this is intended.
60 if( $work =~ m/\[%.*?\]/ ){
61 #everything before this tag is text (or possibly CDATA), add a text token to tokens if $`
# $` is the part of $work that precedes the match
63 my $t = C4::TmplToken->new( $`, ($is_cdata? C4
::TmplTokenType
::CDATA
: C4
::TmplTokenType
::TEXT
), $line, $self->{filename
} );
67 #the match itself is a DIRECTIVE $&
68 my $t = C4
::TmplToken
->new( $&, C4
::TmplTokenType
::DIRECTIVE
, $line, $self->{filename
} );
71 # put work still to do back into work
74 # If there is some left over work, treat it as text token
75 my $t = C4
::TmplToken
->new( $work, ($is_cdata? C4
::TmplTokenType
::CDATA
: C4
::TmplTokenType
::TEXT
), $line, $self->{filename
} );
# Builds a single token from the whole original text: CDATA type when $is_cdata is true, TEXT
# type otherwise, tagged with $line and $self->{filename}.
# NOTE(review): the sub header is not visible in this excerpt. This is presumably one of the
# handlers that build_tokens registers with the argspec "self, line, text, is_cdata"
# (declaration / comment) -- confirm which. The declarations of $line and $is_cdata are also
# not visible here.
86 my $work = shift; #original text
88 my $t = C4
::TmplToken
->new( $work, ($is_cdata? C4
::TmplTokenType
::CDATA
: C4
::TmplTokenType
::TEXT
), $line, $self->{filename
} );
# Builds a single token from the whole original text: CDATA type when $is_cdata is true, TEXT
# type otherwise, tagged with $line and $self->{filename}.
# NOTE(review): the sub header is not visible in this excerpt. This is presumably one of the
# handlers that build_tokens registers with the argspec "self, line, text, is_cdata"
# (declaration / comment) -- confirm which. The declarations of $line and $is_cdata are also
# not visible here.
95 my $work = shift; #original text
97 my $t = C4
::TmplToken
->new( $work, ($is_cdata? C4
::TmplTokenType
::CDATA
: C4
::TmplTokenType
::TEXT
), $line, $self->{filename
} );
# Builds a single token from the whole original text: CDATA type when $is_cdata is true, TEXT
# type otherwise, tagged with $line and $self->{filename}.
# NOTE(review): the sub header is not visible in this excerpt. This is presumably the "default"
# handler whose registration is commented out in build_tokens -- confirm. The declaration of
# $line is not visible here.
104 my $work = shift; #original text
105 my $is_cdata = shift; #true when the text should become a CDATA token rather than a TEXT token
106 my $t = C4
::TmplToken
->new( $work, ($is_cdata? C4
::TmplTokenType
::CDATA
: C4
::TmplTokenType
::TEXT
), $line, $self->{filename
} );
111 #handle opening html tags
# Builds a TAG token from the tag's original text and attaches its attributes to the token.
# NOTE(review): the sub header, the unpacking of the earlier arguments, the declaration of %attr
# and the condition that chooses between the two assignments in the loop are not visible in
# this excerpt.
116 my $hash = shift; #hash of attr/value pairs
117 my $text = shift; #original text
118 my $t = C4
::TmplToken
->new( $text, C4
::TmplTokenType
::TAG
, $line, $self->{filename
});
120 # tags seem to be used in an 'interesting' way elsewhere..
# NOTE(review): %$hash in list context yields the values as well as the keys, so $key also takes
# each attribute value in turn; "keys %$hash" was probably intended. The defined-check on the next
# line skips any value that is not itself a key of the hash.
121 for my $key( %$hash ) {
122 next unless defined $hash->{$key};
# each entry is stored under the lower-cased name as [ name, value, "name=value", flag ]
# here the flag is 1 ...
124 $attr{+lc($key)} = [ $key, $hash->{$key}, $key."=".$hash->{$key}, 1 ];
# ... and here it is 0
127 $attr{+lc($key)} = [ $key, $hash->{$key}, $key."=".$hash->{$key}, 0 ];
# hand the collected attributes to the token as a hash reference
130 $t->set_attributes( \
%attr );
134 #handle closing html tags
# Builds a TAG token from the tag's original text and attaches its attributes to the token.
# NOTE(review): the sub header, the argument unpacking ($line, $hash, $text) and the declaration
# of %attr are not visible in this excerpt.
141 # what format should this be in?
142 my $t = C4
::TmplToken
->new( $text, C4
::TmplTokenType
::TAG
, $line, $self->{filename
} );
144 # tags seem to be used in an 'interesting' way elsewhere..
# NOTE(review): %$hash in list context yields the values as well as the keys, so $key also takes
# each attribute value in turn; "keys %$hash" was probably intended. If $hash is the attr argument
# of the end event, HTML::Parser documents attr as undef outside start events, in which case this
# loop would have nothing to iterate over -- confirm.
145 for my $key( %$hash ) {
146 next unless defined $hash->{$key};
# each entry is stored under the lower-cased name as [ name, value, "name=value", 0 ]
147 $attr{+lc($key)} = [ $key, $hash->{$key}, $key."=".$hash->{$key}, 0 ];
# hand the collected attributes to the token as a hash reference
149 $t->set_attributes( \
%attr );