fixed problem with comment()
[HTML-TreeBuilder-XPath.git] / XPath_with_as_XML_clean.pm
blob27b387ddd704802305630e4fa2373d0d40da293a
1 package HTML::TreeBuilder::XPath;
4 use strict;
5 use warnings;
7 use vars qw($VERSION);
9 $VERSION = '0.10';
11 my %CHAR2DEFAULT_ENT= ( '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quote;');
12 my %NUM2DEFAULT_ENT= ( '38' => 'amp', '60' => 'lt', '62' => 'gt', '"' => '&quote;');
14 package HTML::TreeBuilder::XPath;
16 use base( 'HTML::TreeBuilder');
19 package HTML::TreeBuilder::XPath::Node;
21 sub isElementNode { 0 }
22 sub isAttributeNode { 0 }
23 sub isNamespaceNode { 0 }
24 sub isTextNode { 0 }
25 sub isProcessingInstructionNode { 0 }
26 sub isPINode { 0 }
27 sub isCommentNode { 0 }
29 sub getChildNodes { return wantarray ? () : []; }
30 sub getFirstChild { return undef; }
31 sub getLastChild { return undef; }
33 sub getElementById
34 { my ($self, $id) = @_;
35 return scalar $self->look_down( id => $id);
38 sub to_number { return XML::XPathEngine::Number->new( shift->getValue); }
40 sub cmp
41 { my( $a, $b)=@_;
43 # comparison with the root (in $b, or processed in HTML::TreeBuilder::XPath::Root)
44 if( $b->isa( 'HTML::TreeBuilder::XPath::Root') ) { return -1; }
46 # easy cases
47 return 0 if( $a == $b);
48 return 1 if( $a->is_inside($b)); # a starts after b
49 return -1 if( $b->is_inside($a)); # a starts before b
51 # lineage does not include the element itself
52 my @a_pile= ($a, $a->lineage);
53 my @b_pile= ($b, $b->lineage);
55 # the 2 elements are not in the same twig
56 unless( $a_pile[-1] == $b_pile[-1])
57 { warn "2 nodes not in the same pile: ", ref( $a), " - ", ref( $b), "\n";
58 print "a: ", $a->string_value, "\nb: ", $b->string_value, "\n";
59 return undef;
62 # find the first non common ancestors (they are siblings)
63 my $a_anc= pop @a_pile;
64 my $b_anc= pop @b_pile;
66 while( $a_anc == $b_anc)
67 { $a_anc= pop @a_pile;
68 $b_anc= pop @b_pile;
71 if( defined( $a_anc->{_rank}) && defined( $b_anc->{_rank}))
72 { return $a_anc->{_rank} <=> $b_anc->{_rank}; }
73 else
75 # from there move left and right and figure out the order
76 my( $a_prev, $a_next, $b_prev, $b_next)= ($a_anc, $a_anc, $b_anc, $b_anc);
77 while()
78 { $a_prev= $a_prev->getPreviousSibling || return -1;
79 return 1 if( $a_prev == $b_anc);
80 $a_next= $a_next->getNextSibling || return 1;
81 return -1 if( $a_next == $b_anc);
82 $b_prev= $b_prev->getPreviousSibling || return 1;
83 return -1 if( $b_prev == $a_next);
84 $b_next= $b_next->getNextSibling || return -1;
85 return 1 if( $b_next == $a_prev);
91 # need to modify directly the HTML::Element package, because HTML::TreeBuilder won't let me
92 # change the class of the nodes it generates
93 package HTML::Element;
94 use Scalar::Util qw(weaken);
95 use vars qw(@ISA);
97 push @ISA, 'HTML::TreeBuilder::XPath::Node';
99 use XML::XPathEngine;
101 { my $xp;
102 sub xp
103 { $xp ||=XML::XPathEngine->new();
104 return $xp;
108 sub findnodes { my( $elt, $path)= @_; return xp->findnodes( $path, $elt); }
109 sub findnodes_as_string { my( $elt, $path)= @_; return xp->findnodes_as_string( $path, $elt); }
110 sub findnodes_as_strings { my( $elt, $path)= @_; return xp->findnodes_as_strings( $path, $elt); }
111 sub findvalue { my( $elt, $path)= @_; return xp->findvalue( $path, $elt); }
112 sub exists { my( $elt, $path)= @_; return xp->exists( $path, $elt); }
113 sub find_xpath { my( $elt, $path)= @_; return xp->find( $path, $elt); }
114 sub matches { my( $elt, $path)= @_; return xp->matches( $elt, $path, $elt); }
115 sub set_namespace { my $elt= shift; xp->new->set_namespace( @_); }
117 sub getRootNode
118 { my $elt= shift;
119 # The parent of root is a HTML::TreeBuilder::XPath::Root
120 # that helps getting the tree to mimic a DOM tree
121 return $elt->root->getParentNode; # I like this one!
124 sub getParentNode
125 { my $elt= shift;
126 return $elt->{_parent} || bless { _root => $elt }, 'HTML::TreeBuilder::XPath::Root';
128 sub getName { return shift->tag; }
129 sub getNextSibling { my( $elt)= @_;
130 my $parent= $elt->{_parent} || return undef;
131 return $parent->_child_as_object( scalar $elt->right, ($elt->{_rank} || 0) + 1);
133 sub getPreviousSibling { my( $elt)= @_;
134 my $parent= $elt->{_parent} || return undef;
135 return undef unless $elt->{_rank};
136 return $parent->_child_as_object( scalar $elt->left, $elt->{_rank} - 1);
138 sub isElementNode { return ref $_[0] && ($_[0]->{_tag}!~ m{^~}) ? 1 : 0; }
139 sub isCommentNode { return ref $_[0] && ($_[0]->{_tag} eq '~comment') ? 1 : 0; }
140 sub isProcessingInstructionNode { return ref $_[0] && ($_[0]->{_tag} eq '~pi') ? 1 : 0; }
141 sub isTextNode { return ref $_[0] ? 0 : 1; }
143 sub getValue
144 { my $elt= shift;
145 if( $elt->isCommentNode) { return $elt->{_text}; }
146 return $elt->as_text;
149 sub getChildNodes
150 { my $parent= shift;
151 my $rank=0;
152 my @children= map { $parent->_child_as_object( $_, $rank++) } $parent->content_list;
153 return wantarray ? @children : \@children;
156 sub getFirstChild
157 { my $parent= shift;
158 my @content= $parent->content_list;
159 if( @content)
160 { return $parent->_child_as_object( $content[0], 0); }
161 else
162 { return undef; }
164 sub getLastChild
165 { my $parent= shift;
166 my @content= $parent->content_list;
167 if( @content)
168 { return $parent->_child_as_object( $content[-1], $#content); }
169 else
170 { return undef; }
173 sub getAttributes
174 { my $elt= shift;
175 my %atts= $elt->all_external_attr;
176 my $rank=0;
177 my @atts= map { bless( { _name => $_, _value => $atts{$_},
178 _elt => $elt, _rank => $rank++,
180 'HTML::TreeBuilder::XPath::Attribute'
182 } sort keys %atts;
183 return wantarray ? @atts : \@atts;
186 sub to_number { return XML::XPathEngine::Number->new( $_[0]->as_text); }
187 sub string_value
188 { my $elt= shift;
189 if( $elt->isCommentNode) { return $elt->{_text}; }
190 return $elt->as_text;
193 # called on a parent, with a child as second argument and its rank as third
194 # returns the child if it is already an element, or
195 # a new HTML::TreeBuilder::XPath::Text element if it is a plain string
196 sub _child_as_object
197 { my( $elt, $elt_or_text, $rank)= @_;
198 return undef unless( defined $elt_or_text);
199 if( ! ref $elt_or_text)
200 { # $elt_or_text is a string, turn it into a TextNode object
201 $elt_or_text= bless { _content => $elt_or_text, _parent => $elt, },
202 'HTML::TreeBuilder::XPath::TextNode'
205 if( ref $rank) { warn "rank is a ", ref( $rank), " elt_or_text is a ", ref( $elt_or_text); }
206 $elt_or_text->{_rank}= $rank; # used for sorting;
207 return $elt_or_text;
210 sub toString { return shift->as_XML_clean( @_); }
212 # produces better looking XML
213 { my( $indent, %return_before_endtag);
214 BEGIN
215 { $indent= ' ';
216 %return_before_endtag= map { $_ => 1 } qw(html head body script div table tr form ol ul);
219 sub as_XML_clean
220 { my( $node, $indent_level)= @_;
222 my $xml= '';
223 my $wrapping_nl= "\n";
225 if( !defined( $indent_level)) { $indent_level = 0; $wrapping_nl= ''; }
227 my $name = $node->{'_tag'};
228 if( $HTML::Tagset::isKnown{lc $name} && !$HTML::Tagset::isPhraseMarkup{lc $name} && $indent_level > 0)
229 { $xml.= $wrapping_nl . ($indent x $indent_level); }
231 if( $name eq '~literal') { $xml= _xml_escape_text( $node->{text}); }
232 elsif( $name eq '~declaration') { $xml= '<!' . _xml_escape_text( $node->{text}) . '>'; }
233 elsif( $name eq '~pi') { $xml= '<?' . _xml_escape_text( $node->{text}) . '?>'; }
234 elsif( $name eq '~comment') { $xml= '<--' . _xml_escape_comment( $node->{text}) . '-->'; }
235 elsif( $HTML::Tagset::isCDATA_Parent{lc $name})
236 { $xml.= $node->_start_tag;
237 my $content= $node->{_content} || '';
238 if( ref $content eq 'ARRAY' || $content->isa( 'ARRAY'))
239 { $xml .= _xml_escape_cdata( join( '', @$content)); }
240 if( $return_before_endtag{lc $name}) { $xml.= "\n" . ($indent x $indent_level); }
242 else
243 { # start tag
244 $xml.= $node->_start_tag;
245 my $child_indent_level= $HTML::Tagset{lc $name} ? $indent_level : $indent_level+1;
246 foreach my $child ($node->content_list)
247 { if( ref $child) { $xml .= $child->as_XML_clean( $child_indent_level); }
248 else { $xml .= _xml_escape_text( $child); }
250 if( $return_before_endtag{lc $name}) { $xml.= "\n" . ($indent x $indent_level); }
252 $xml .="</$name>" unless $HTML::Tagset::emptyElement{lc $name};
253 if( $indent_level == 0) { $xml .= $wrapping_nl; }
254 return $xml;
259 sub _start_tag
260 { my( $node)= @_;
261 my $name = $node->{'_tag'};
262 my $start_tag.= "<$name";
263 foreach my $att (sort keys %$node)
264 { next if( (!length $att) || ($att=~ m{^_}) || ($att eq '/') );
265 $start_tag .= qq{ $att="} . _xml_escape_attribute_value $node->{$att} . qq{"};
267 $start_tag.= $HTML::Tagset::emptyElement{lc $name} ? " />" : ">";
268 return $start_tag;
271 sub _indent_level
272 { my( $node)= @_;
273 my $level= scalar grep { !$HTML::Tagset::isPhraseMarkup{lc $_->{_tag}} } $node->lineage;
274 return $level;
278 sub _xml_escape_attribute_value
279 { my( $text)= @_;
280 $text=~ s{([&<>"])}{$CHAR2DEFAULT_ENT{$1}}g; # escape also quote, as it is the attribute separator
281 return $text;
284 sub _xml_escape_text
285 { my( $text)= @_;
286 $text=~ s{([&<>])}{$CHAR2DEFAULT_ENT{$1}}g;
287 return $text;
290 sub _xml_escape_comment
291 { my( $text)= @_;
292 $text=~ s{([&<>])}{$CHAR2DEFAULT_ENT{$1}}g;
293 $text=~ s{--}{-&#45;}g; # can't have double --'s in XML comments
294 return $text;
297 sub _xml_escape_cdata
298 { my( $text)= @_;
299 $text=~ s{^\s*\Q<![CDATA[}{}s;
300 $text=~ s{\Q]]>\E\s*$}{}s;
301 $text=~ s{]]>}{]]&#62;}g; # can't have]]> in CDATA
302 $text= "<![CDATA[$text]]>";
303 return $text;
307 package HTML::TreeBuilder::XPath::TextNode;
309 use base 'HTML::TreeBuilder::XPath::Node';
311 sub getParentNode { return shift->{_parent}; }
312 sub getValue { return shift->{_content}; }
313 sub isTextNode { return 1; }
314 sub getAttributes { return wantarray ? () : []; }
316 # similar to HTML::Element as_XML
317 sub as_XML
318 { my( $node, $entities)= @_;
319 my $content= $node->{_content};
320 if( $node->{_parent} && $node->{_parent}->{_tag} eq 'script')
321 { $content=~ s{(&\w+;)}{HTML::Entities::decode($1)}eg; }
322 else
323 { HTML::Element::_xml_escape($content); }
324 return $content;
327 sub as_XML_clean
328 { my( $node, $entities)= @_;
329 my $content= $node->{_content};
330 if( $node->{_parent} && $node->{_parent}->{_tag} eq 'script')
331 { $content=~ s{(&\w+;)}{HTML::Entities::decode($1)}eg; }
332 else
333 { $content= HTML::Element::_xml_escape_text($content); }
334 return $content;
337 sub getPreviousSibling
338 { my $self= shift;
339 my $rank= $self->{_rank};
340 #unless( defined $self->{_rank})
341 # { warn "no rank for text node $self->{_content}, parent is $self->{_parent}->{_tag}\n"; }
342 my $parent= $self->{_parent};
343 return $rank ? $parent->_child_as_object( $parent->{_content}->[$rank-1], $rank-1) : undef;
346 sub getNextSibling
347 { my $self= shift;
348 my $rank= $self->{_rank};
349 #unless( defined $self->{_rank})
350 # { warn "no rank for text node $self->{_content}, parent is $self->{_parent}->{_tag}\n"; }
351 my $parent= $self->{_parent};
352 my $next_sibling= $parent->{_content}->[$rank+1];
353 return defined( $next_sibling) ? $parent->_child_as_object( $next_sibling, $rank+1) : undef;
356 sub getRootNode
357 { return shift->{_parent}->getRootNode; }
359 sub string_value { return shift->{_content}; }
361 # added to provide element-like methods to text nodes, for use by cmp
362 sub lineage
363 { my( $node)= @_;
364 my $parent= $node->{_parent};
365 return( $parent, $parent->lineage);
368 sub is_inside
369 { my( $text, $node)= @_;
370 return $text->{_parent}->is_inside( $node);
376 package HTML::TreeBuilder::XPath::Attribute;
377 use base 'HTML::TreeBuilder::XPath::Node';
379 sub getParentNode { return $_[0]->{_elt}; }
380 sub getValue { return $_[0]->{_value}; }
381 sub getName { return $_[0]->{_name} ; }
382 sub getLocalName { (my $name= $_[0]->{_name}) =~ s{^.*:}{}; $name; }
383 sub string_value { return $_[0]->{_value}; }
384 sub to_number { return XML::XPathEngine::Number->new( $_[0]->{_value}); }
385 sub isAttributeNode { 1 }
386 sub toString { return qq{$_[0]->{_name}="$_[0]->{_value}"}; }
388 # awfully inefficient, but hopefully this is called only for weird (read test-case) queries
389 sub getPreviousSibling
390 { my $self= shift;
391 my $rank= $self->{_rank};
392 return undef unless $rank;
393 my %atts= $self->{_elt}->all_external_attr;
394 my $previous_att_name= (sort keys %atts)[$rank-1];
395 return bless( { _name => $previous_att_name,
396 _value => $atts{$previous_att_name},
397 _elt => $self->{_elt}, _rank => $rank-1,
398 }, 'HTML::TreeBuilder::XPath::Attribute'
402 sub getNextSibling
403 { my $self= shift;
404 my $rank= $self->{_rank};
405 my %atts= $self->{_elt}->all_external_attr;
406 my $next_att_name= (sort keys %atts)[$rank+1] || return undef;
407 return bless( { _name => $next_att_name, _value => $atts{$next_att_name},
408 _elt => $self->{_elt}, _rank => $rank+1,
409 }, 'HTML::TreeBuilder::XPath::Attribute'
416 # added to provide element-like methods to attributes, for use by cmp
417 sub lineage
418 { my( $att)= @_;
419 my $elt= $att->{_elt};
420 return( $elt, $elt->lineage);
423 sub is_inside
424 { my( $att, $node)= @_;
425 return ($att->{_elt} == $node) || $att->{_elt}->is_inside( $node);
431 package HTML::TreeBuilder::XPath::Root;
433 use base 'HTML::TreeBuilder::XPath::Node';
435 sub getParentNode { return (); }
436 sub getChildNodes { my @content= ( $_[0]->{_root}); return wantarray ? @content : \@content; }
437 sub getAttributes { return [] }
438 sub isDocumentNode { return 1 }
440 # added to provide element-like methods to root, for use by cmp
441 sub lineage { return ($_[0]); }
442 sub is_inside { return 0; }
443 sub cmp { return $_[1]->isa( ' HTML::TreeBuilder::XPath::Root') ? 0 : 1; }
447 __END__
448 =head1 NAME
450 HTML::TreeBuilder::XPath - add XPath support to HTML::TreeBuilder
452 =head1 SYNOPSIS
454 use HTML::TreeBuilder::XPath;
455 my $tree= HTML::TreeBuilder::XPath->new;
456 $tree->parse_file( "mypage.html");
457 my $nb=$tree->findvalue( '/html/body//p[@class="section_title"]/span[@class="nb"]');
458 my $id=$tree->findvalue( '/html/body//p[@class="section_title"]/@id');
460 my $p= $html->findnodes( '//p[@id="toto"]')->[0];
461 my $link_texts= $p->findvalue( './a'); # the texts of all a elements in $p
464 =head1 DESCRIPTION
466 This module adds typical XPath methods to HTML::TreeBuilder, to make it
467 easy to query a document.
469 =head1 METHODS
471 Extra methods added both to the tree object and to each element:
473 =head2 findnodes ($path)
475 Returns a list of nodes found by C<$path>.
476 In scalar context returns an C<Tree::XPathEngine::NodeSet> object.
478 =head2 findnodes_as_string ($path)
480 Returns the text values of the nodes, as one string.
482 =head2 findnodes_as_strings ($path)
484 Returns a list of the values of the result nodes.
486 =head2 findvalue ($path)
488 Returns either a C<Tree::XPathEngine::Literal>, a C<Tree::XPathEngine::Boolean>
489 or a C<Tree::XPathEngine::Number> object. If the path returns a NodeSet,
490 $nodeset->xpath_to_literal is called automatically for you (and thus a
491 C<Tree::XPathEngine::Literal> is returned). Note that
492 for each of the objects stringification is overloaded, so you can just
493 print the value found, or manipulate it in the ways you would a normal
494 perl value (e.g. using regular expressions).
496 =head2 exists ($path)
498 Returns true if the given path exists.
500 =head2 matches($path)
502 Returns true if the element matches the path.
504 =head2 find ($path)
506 The find function takes an XPath expression (a string) and returns either a
507 Tree::XPathEngine::NodeSet object containing the nodes it found (or empty if
508 no nodes matched the path), or one of XML::XPathEngine::Literal (a string),
509 XML::XPathEngine::Number, or XML::XPathEngine::Boolean. It should always
510 return something - and you can use ->isa() to find out what it returned. If
511 you need to check how many nodes it found you should check $nodeset->size.
512 See L<XML::XPathEngine::NodeSet>.
514 =head2 as_XML_clean ($optional_indent_level)
516 HTML::TreeBuilder's C<as_XML> output is not really nice to look at, so
517 I added a new method, that can be used as a simple replacement for it.
518 It escapes only the '<', '>' and '&' (plus '"' in attribute values), and
519 wraps CDATA elements in CDATA sections.
521 The C<$optional_indent_level> defaults to the level in the original HTML
522 document (ie you probably don't have to use it)
524 This method is currently in alpha state. Ping me if you want other options added
525 to it (wrapping?).
527 =head1 SEE ALSO
529 L<HTML::TreeBuilder>
531 L<XML::XPathEngine>
533 =head1 AUTHOR
535 Michel Rodriguez, E<lt>mirod@cpan.orgE<gt>
537 =head1 COPYRIGHT AND LICENSE
539 Copyright (C) 2006 by Michel Rodriguez
541 This library is free software; you can redistribute it and/or modify
542 it under the same terms as Perl itself, either Perl version 5.8.4 or,
543 at your option, any later version of Perl 5 you may have available.
546 =cut