1 package HTML
::TreeBuilder
::XPath
;
11 my %CHAR2DEFAULT_ENT= ( '&' => '&', '<' => '<', '>' => '>', '"' => '"e;');
12 my %NUM2DEFAULT_ENT= ( '38' => 'amp', '60' => 'lt', '62' => 'gt', '"' => '"e;');
14 package HTML::TreeBuilder::XPath;
16 use base( 'HTML::TreeBuilder');
19 package HTML::TreeBuilder::XPath::Node;
21 sub isElementNode { 0 }
22 sub isAttributeNode { 0 }
23 sub isNamespaceNode { 0 }
25 sub isProcessingInstructionNode { 0 }
27 sub isCommentNode { 0 }
29 sub getChildNodes { return wantarray ? () : []; }
30 sub getFirstChild { return undef; }
31 sub getLastChild { return undef; }
34 { my ($self, $id) = @_;
35 return scalar $self->look_down( id => $id);
38 sub to_number { return XML::XPathEngine::Number->new( shift->getValue); }
43 # comparison with the root (in $b, or processed in HTML::TreeBuilder::XPath::Root)
44 if( $b->isa( 'HTML::TreeBuilder::XPath::Root') ) { return -1; }
47 return 0 if( $a == $b);
48 return 1 if( $a->is_inside($b)); # a starts after b
49 return -1 if( $b->is_inside($a)); # a starts before b
51 # lineage does not include the element itself
52 my @a_pile= ($a, $a->lineage);
53 my @b_pile= ($b, $b->lineage);
55 # the 2 elements are not in the same twig
56 unless( $a_pile[-1] == $b_pile[-1])
57 { warn "2 nodes not in the same pile: ", ref( $a), " - ", ref( $b), "\n";
58 print "a: ", $a->string_value, "\nb: ", $b->string_value, "\n";
62 # find the first non common ancestors (they are siblings)
63 my $a_anc= pop @a_pile;
64 my $b_anc= pop @b_pile;
66 while( $a_anc == $b_anc)
67 { $a_anc= pop @a_pile;
71 if( defined( $a_anc->{_rank}) && defined( $b_anc->{_rank}))
72 { return $a_anc->{_rank} <=> $b_anc->{_rank}; }
75 # from there move left and right and figure out the order
76 my( $a_prev, $a_next, $b_prev, $b_next)= ($a_anc, $a_anc, $b_anc, $b_anc);
78 { $a_prev= $a_prev->getPreviousSibling || return -1;
79 return 1 if( $a_prev == $b_anc);
80 $a_next= $a_next->getNextSibling || return 1;
81 return -1 if( $a_next == $b_anc);
82 $b_prev= $b_prev->getPreviousSibling || return 1;
83 return -1 if( $b_prev == $a_next);
84 $b_next= $b_next->getNextSibling || return -1;
85 return 1 if( $b_next == $a_prev);
91 # need to modify directly the HTML::Element package, because HTML::TreeBuilder won't let me
92 # change the class of the nodes it generates
93 package HTML::Element;
94 use Scalar::Util qw(weaken);
97 push @ISA, 'HTML::TreeBuilder::XPath::Node';
103 { $xp ||=XML::XPathEngine->new();
108 sub findnodes { my( $elt, $path)= @_; return xp->findnodes( $path, $elt); }
109 sub findnodes_as_string { my( $elt, $path)= @_; return xp->findnodes_as_string( $path, $elt); }
110 sub findnodes_as_strings { my( $elt, $path)= @_; return xp->findnodes_as_strings( $path, $elt); }
111 sub findvalue { my( $elt, $path)= @_; return xp->findvalue( $path, $elt); }
112 sub exists { my( $elt, $path)= @_; return xp->exists( $path, $elt); }
113 sub find_xpath { my( $elt, $path)= @_; return xp->find( $path, $elt); }
114 sub matches { my( $elt, $path)= @_; return xp->matches( $elt, $path, $elt); }
115 sub set_namespace { my $elt= shift; xp->new->set_namespace( @_); }
119 # The parent of root is a HTML::TreeBuilder::XPath::Root
120 # that helps getting the tree to mimic a DOM tree
121 return $elt->root->getParentNode; # I like this one!
126 return $elt->{_parent} || bless { _root => $elt }, 'HTML::TreeBuilder::XPath::Root';
128 sub getName { return shift->tag; }
129 sub getNextSibling { my( $elt)= @_;
130 my $parent= $elt->{_parent} || return undef;
131 return $parent->_child_as_object( scalar $elt->right, ($elt->{_rank} || 0) + 1);
133 sub getPreviousSibling { my( $elt)= @_;
134 my $parent= $elt->{_parent} || return undef;
135 return undef unless $elt->{_rank};
136 return $parent->_child_as_object( scalar $elt->left, $elt->{_rank} - 1);
138 sub isElementNode { return ref $_[0] && ($_[0]->{_tag}!~ m{^~}) ? 1 : 0; }
139 sub isCommentNode { return ref $_[0] && ($_[0]->{_tag} eq '~comment') ? 1 : 0; }
140 sub isProcessingInstructionNode { return ref $_[0] && ($_[0]->{_tag} eq '~pi') ? 1 : 0; }
141 sub isTextNode { return ref $_[0] ? 0 : 1; }
145 if( $elt->isCommentNode) { return $elt->{_text}; }
146 return $elt->as_text;
152 my @children= map { $parent->_child_as_object( $_, $rank++) } $parent->content_list;
153 return wantarray ? @children : \@children;
158 my @content= $parent->content_list;
160 { return $parent->_child_as_object( $content[0], 0); }
166 my @content= $parent->content_list;
168 { return $parent->_child_as_object( $content[-1], $#content); }
175 my %atts= $elt->all_external_attr;
177 my @atts= map { bless( { _name => $_, _value => $atts{$_},
178 _elt => $elt, _rank => $rank++,
180 'HTML::TreeBuilder::XPath::Attribute'
183 return wantarray ? @atts : \@atts;
186 sub to_number { return XML::XPathEngine::Number->new( $_[0]->as_text); }
189 if( $elt->isCommentNode) { return $elt->{_text}; }
190 return $elt->as_text;
193 # called on a parent, with a child as second argument and its rank as third
194 # returns the child if it is already an element, or
195 # a new HTML::TreeBuilder::XPath::Text element if it is a plain string
197 { my( $elt, $elt_or_text, $rank)= @_;
198 return undef unless( defined $elt_or_text);
199 if( ! ref $elt_or_text)
200 { # $elt_or_text is a string, turn it into a TextNode object
201 $elt_or_text= bless { _content => $elt_or_text, _parent => $elt, },
202 'HTML::TreeBuilder::XPath::TextNode'
205 if( ref $rank) { warn "rank is a ", ref( $rank), " elt_or_text is a ", ref( $elt_or_text); }
206 $elt_or_text->{_rank}= $rank; # used for sorting;
210 sub toString { return shift->as_XML_clean( @_); }
212 # produces better looking XML
213 { my( $indent, %return_before_endtag);
216 %return_before_endtag= map { $_ => 1 } qw(html head body script div table tr form ol ul);
220 { my( $node, $indent_level)= @_;
223 my $wrapping_nl= "\n";
225 if( !defined( $indent_level)) { $indent_level = 0; $wrapping_nl= ''; }
227 my $name = $node->{'_tag'};
228 if( $HTML::Tagset
::isKnown
{lc $name} && !$HTML::Tagset
::isPhraseMarkup
{lc $name} && $indent_level > 0)
229 { $xml.= $wrapping_nl . ($indent x
$indent_level); }
231 if( $name eq '~literal') { $xml= _xml_escape_text
( $node->{text
}); }
232 elsif( $name eq '~declaration') { $xml= '<!' . _xml_escape_text
( $node->{text
}) . '>'; }
233 elsif( $name eq '~pi') { $xml= '<?' . _xml_escape_text
( $node->{text
}) . '?>'; }
234 elsif( $name eq '~comment') { $xml= '<--' . _xml_escape_comment
( $node->{text
}) . '-->'; }
235 elsif( $HTML::Tagset
::isCDATA_Parent
{lc $name})
236 { $xml.= $node->_start_tag;
237 my $content= $node->{_content
} || '';
238 if( ref $content eq 'ARRAY' || $content->isa( 'ARRAY'))
239 { $xml .= _xml_escape_cdata
( join( '', @
$content)); }
240 if( $return_before_endtag{lc $name}) { $xml.= "\n" . ($indent x
$indent_level); }
244 $xml.= $node->_start_tag;
245 my $child_indent_level= $HTML::Tagset
{lc $name} ?
$indent_level : $indent_level+1;
246 foreach my $child ($node->content_list)
247 { if( ref $child) { $xml .= $child->as_XML_clean( $child_indent_level); }
248 else { $xml .= _xml_escape_text
( $child); }
250 if( $return_before_endtag{lc $name}) { $xml.= "\n" . ($indent x
$indent_level); }
252 $xml .="</$name>" unless $HTML::Tagset
::emptyElement
{lc $name};
253 if( $indent_level == 0) { $xml .= $wrapping_nl; }
261 my $name = $node->{'_tag'};
262 my $start_tag.= "<$name";
263 foreach my $att (sort keys %$node)
264 { next if( (!length $att) || ($att=~ m{^_}) || ($att eq '/') );
265 $start_tag .= qq{ $att="} . _xml_escape_attribute_value $node->{$att} . qq{"};
267 $start_tag.= $HTML::Tagset
::emptyElement
{lc $name} ?
" />" : ">";
273 my $level= scalar grep { !$HTML::Tagset
::isPhraseMarkup
{lc $_->{_tag
}} } $node->lineage;
278 sub _xml_escape_attribute_value
280 $text=~ s{([&<>"])}{$CHAR2DEFAULT_ENT{$1}}g
; # escape also quote, as it is the attribute separator
286 $text=~ s{([&<>])}{$CHAR2DEFAULT_ENT{$1}}g
;
290 sub _xml_escape_comment
292 $text=~ s{([&<>])}{$CHAR2DEFAULT_ENT{$1}}g
;
293 $text=~ s{--}{--}g; # can't have double --'s in XML comments
297 sub _xml_escape_cdata
299 $text=~ s{^\s*\Q<![CDATA[}{}s;
300 $text=~ s{\Q]]>\E\s*$}{}s;
301 $text=~ s{]]>}{]]>}g; # can't have]]> in CDATA
302 $text= "<![CDATA[$text]]>";
307 package HTML
::TreeBuilder
::XPath
::TextNode
;
309 use base
'HTML::TreeBuilder::XPath::Node';
311 sub getParentNode
{ return shift->{_parent
}; }
312 sub getValue
{ return shift->{_content
}; }
313 sub isTextNode
{ return 1; }
314 sub getAttributes
{ return wantarray ?
() : []; }
316 # similar to HTML::Element as_XML
318 { my( $node, $entities)= @_;
319 my $content= $node->{_content
};
320 if( $node->{_parent
} && $node->{_parent
}->{_tag
} eq 'script')
321 { $content=~ s{(&\w+;)}{HTML::Entities::decode($1)}eg; }
323 { HTML
::Element
::_xml_escape
($content); }
328 { my( $node, $entities)= @_;
329 my $content= $node->{_content
};
330 if( $node->{_parent
} && $node->{_parent
}->{_tag
} eq 'script')
331 { $content=~ s{(&\w+;)}{HTML::Entities::decode($1)}eg; }
333 { $content= HTML
::Element
::_xml_escape_text
($content); }
337 sub getPreviousSibling
339 my $rank= $self->{_rank
};
340 #unless( defined $self->{_rank})
341 # { warn "no rank for text node $self->{_content}, parent is $self->{_parent}->{_tag}\n"; }
342 my $parent= $self->{_parent
};
343 return $rank ?
$parent->_child_as_object( $parent->{_content
}->[$rank-1], $rank-1) : undef;
348 my $rank= $self->{_rank
};
349 #unless( defined $self->{_rank})
350 # { warn "no rank for text node $self->{_content}, parent is $self->{_parent}->{_tag}\n"; }
351 my $parent= $self->{_parent
};
352 my $next_sibling= $parent->{_content
}->[$rank+1];
353 return defined( $next_sibling) ?
$parent->_child_as_object( $next_sibling, $rank+1) : undef;
357 { return shift->{_parent
}->getRootNode; }
359 sub string_value
{ return shift->{_content
}; }
361 # added to provide element-like methods to text nodes, for use by cmp
364 my $parent= $node->{_parent
};
365 return( $parent, $parent->lineage);
369 { my( $text, $node)= @_;
370 return $text->{_parent
}->is_inside( $node);
376 package HTML
::TreeBuilder
::XPath
::Attribute
;
377 use base
'HTML::TreeBuilder::XPath::Node';
379 sub getParentNode
{ return $_[0]->{_elt
}; }
380 sub getValue
{ return $_[0]->{_value
}; }
381 sub getName
{ return $_[0]->{_name
} ; }
382 sub getLocalName
{ (my $name= $_[0]->{_name
}) =~ s{^.*:}{}; $name; }
383 sub string_value
{ return $_[0]->{_value
}; }
384 sub to_number
{ return XML
::XPathEngine
::Number
->new( $_[0]->{_value
}); }
385 sub isAttributeNode
{ 1 }
386 sub toString
{ return qq{$_[0]->{_name
}="$_[0]->{_value}"}; }
388 # awfully inefficient, but hopefully this is called only for weird (read test-case) queries
389 sub getPreviousSibling
391 my $rank= $self->{_rank
};
392 return undef unless $rank;
393 my %atts= $self->{_elt
}->all_external_attr;
394 my $previous_att_name= (sort keys %atts)[$rank-1];
395 return bless( { _name
=> $previous_att_name,
396 _value
=> $atts{$previous_att_name},
397 _elt
=> $self->{_elt
}, _rank
=> $rank-1,
398 }, 'HTML::TreeBuilder::XPath::Attribute'
404 my $rank= $self->{_rank
};
405 my %atts= $self->{_elt
}->all_external_attr;
406 my $next_att_name= (sort keys %atts)[$rank+1] || return undef;
407 return bless( { _name
=> $next_att_name, _value
=> $atts{$next_att_name},
408 _elt
=> $self->{_elt
}, _rank
=> $rank+1,
409 }, 'HTML::TreeBuilder::XPath::Attribute'
416 # added to provide element-like methods to attributes, for use by cmp
419 my $elt= $att->{_elt
};
420 return( $elt, $elt->lineage);
424 { my( $att, $node)= @_;
425 return ($att->{_elt
} == $node) || $att->{_elt
}->is_inside( $node);
431 package HTML
::TreeBuilder
::XPath
::Root
;
433 use base
'HTML::TreeBuilder::XPath::Node';
435 sub getParentNode
{ return (); }
436 sub getChildNodes
{ my @content= ( $_[0]->{_root
}); return wantarray ?
@content : \
@content; }
437 sub getAttributes
{ return [] }
438 sub isDocumentNode
{ return 1 }
440 # added to provide element-like methods to root, for use by cmp
441 sub lineage
{ return ($_[0]); }
442 sub is_inside
{ return 0; }
443 sub cmp { return $_[1]->isa( ' HTML::TreeBuilder::XPath::Root') ?
0 : 1; }
450 HTML::TreeBuilder::XPath - add XPath support to HTML::TreeBuilder
454 use HTML::TreeBuilder::XPath;
455 my $tree= HTML::TreeBuilder::XPath->new;
456 $tree->parse_file( "mypage.html");
457 my $nb=$tree->findvalue( '/html/body//p[@class="section_title"]/span[@class="nb"]');
458 my $id=$tree->findvalue( '/html/body//p[@class="section_title"]/@id');
460 my $p= $html->findnodes( '//p[@id="toto"]')->[0];
461 my $link_texts= $p->findvalue( './a'); # the texts of all a elements in $p
466 This module adds typical XPath methods to HTML::TreeBuilder, to make it
467 easy to query a document.
471 Extra methods added both to the tree object and to each element:
473 =head2 findnodes ($path)
475 Returns a list of nodes found by C<$path>.
476 In scalar context returns an C<Tree::XPathEngine::NodeSet> object.
478 =head2 findnodes_as_string ($path)
480 Returns the text values of the nodes, as one string.
482 =head2 findnodes_as_strings ($path)
484 Returns a list of the values of the result nodes.
486 =head2 findvalue ($path)
488 Returns either a C<Tree::XPathEngine::Literal>, a C<Tree::XPathEngine::Boolean>
489 or a C<Tree::XPathEngine::Number> object. If the path returns a NodeSet,
490 $nodeset->xpath_to_literal is called automatically for you (and thus a
491 C<Tree::XPathEngine::Literal> is returned). Note that
492 for each of the objects stringification is overloaded, so you can just
493 print the value found, or manipulate it in the ways you would a normal
494 perl value (e.g. using regular expressions).
496 =head2 exists ($path)
498 Returns true if the given path exists.
500 =head2 matches($path)
502 Returns true if the element matches the path.
506 The find function takes an XPath expression (a string) and returns either a
507 Tree::XPathEngine::NodeSet object containing the nodes it found (or empty if
508 no nodes matched the path), or one of XML::XPathEngine::Literal (a string),
509 XML::XPathEngine::Number, or XML::XPathEngine::Boolean. It should always
510 return something - and you can use ->isa() to find out what it returned. If
511 you need to check how many nodes it found you should check $nodeset->size.
512 See L<XML::XPathEngine::NodeSet>.
514 =head2 as_XML_clean ($optional_indent_level)
516 HTML::TreeBuilder's C<as_XML> output is not really nice to look at, so
517 I added a new method, that can be used as a simple replacement for it.
518 It escapes only the '<', '>' and '&' (plus '"' in attribute values), and
519 wraps CDATA elements in CDATA sections.
521 The C<$optional_indent_level> defaults to the level in the original HTML
522 document (ie you probably don't have to use it)
524 This method is currently in alpha state. Ping me if you want other options added
535 Michel Rodriguez, E<lt>mirod@cpan.orgE<gt>
537 =head1 COPYRIGHT AND LICENSE
539 Copyright (C) 2006 by Michel Rodriguez
541 This library is free software; you can redistribute it and/or modify
542 it under the same terms as Perl itself, either Perl version 5.8.4 or,
543 at your option, any later version of Perl 5 you may have available.