XPath_with_as_XML_clean.pm

   1 package HTML::TreeBuilder::XPath;
   2
   3
   4 use strict;
   5 use warnings;
   6
   7 use vars qw($VERSION);
   8
   9 $VERSION = '0.10';
  10
  11 my %CHAR2DEFAULT_ENT= ( '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quote;');
  12 my %NUM2DEFAULT_ENT= ( '38' => 'amp', '60' => 'lt', '62' => 'gt', '"' => '&quote;');
  13
  14 package HTML::TreeBuilder::XPath;
  15
  16 use base( 'HTML::TreeBuilder');
  17
  18
  19 package HTML::TreeBuilder::XPath::Node;
  20
  21 sub isElementNode   { 0 }
  22 sub isAttributeNode { 0 }
  23 sub isNamespaceNode { 0 }
  24 sub isTextNode      { 0 }
  25 sub isProcessingInstructionNode { 0 }
  26 sub isPINode        { 0 }
  27 sub isCommentNode   { 0 }
  28
  29 sub getChildNodes { return wantarray ? () : []; }
  30 sub getFirstChild { return undef; }
  31 sub getLastChild { return undef; }
  32
  33 sub getElementById
  34   { my ($self, $id) = @_;
  35     return scalar $self->look_down( id => $id);
  36   }
  37
  38 sub to_number { return XML::XPathEngine::Number->new( shift->getValue); }
  39
  40 sub cmp
  41   { my( $a, $b)=@_;
  42
  43     # comparison with the root (in $b, or processed in HTML::TreeBuilder::XPath::Root)
  44     if( $b->isa( 'HTML::TreeBuilder::XPath::Root') ) { return -1; }
  45
  46     # easy cases
  47     return  0 if( $a == $b);
  48     return  1 if( $a->is_inside($b)); # a starts after b
  49     return -1 if( $b->is_inside($a)); # a starts before b
  50
  51     # lineage does not include the element itself
  52     my @a_pile= ($a, $a->lineage);
  53     my @b_pile= ($b, $b->lineage);
  54
  55     # the 2 elements are not in the same twig
  56     unless( $a_pile[-1] == $b_pile[-1])
  57       { warn "2 nodes not in the same pile: ", ref( $a), " - ", ref( $b), "\n";
  58         print "a: ", $a->string_value, "\nb: ", $b->string_value, "\n";
  59         return undef;
  60       }
  61
  62     # find the first non common ancestors (they are siblings)
  63     my $a_anc= pop @a_pile;
  64     my $b_anc= pop @b_pile;
  65
  66     while( $a_anc == $b_anc)
  67       { $a_anc= pop @a_pile;
  68         $b_anc= pop @b_pile;
  69       }
  70
  71     if( defined( $a_anc->{_rank}) && defined( $b_anc->{_rank}))
  72       { return $a_anc->{_rank} <=> $b_anc->{_rank}; }
  73     else
  74       {
  75         # from there move left and right and figure out the order
  76         my( $a_prev, $a_next, $b_prev, $b_next)= ($a_anc, $a_anc, $b_anc, $b_anc);
  77         while()
  78           { $a_prev= $a_prev->getPreviousSibling || return -1;
  79             return  1 if( $a_prev == $b_anc);
  80             $a_next= $a_next->getNextSibling     || return  1;
  81             return -1 if( $a_next == $b_anc);
  82             $b_prev= $b_prev->getPreviousSibling || return  1;
  83             return -1 if( $b_prev == $a_next);
  84             $b_next= $b_next->getNextSibling     || return -1;
  85             return  1 if( $b_next == $a_prev);
  86           }
  87       }
  88   }
  89
  90
  91 # need to modify directly the HTML::Element package, because HTML::TreeBuilder won't let me
  92 # change the class of the nodes it generates
  93 package HTML::Element;
  94 use Scalar::Util qw(weaken);
  95 use vars qw(@ISA);
  96
  97 push @ISA, 'HTML::TreeBuilder::XPath::Node';
  98
  99 use XML::XPathEngine;
 100
 101 { my $xp;
 102   sub xp
 103     { $xp ||=XML::XPathEngine->new();
 104       return $xp;
 105     }
 106 }
 107
 108 sub findnodes            { my( $elt, $path)= @_; return xp->findnodes(            $path, $elt); }
 109 sub findnodes_as_string  { my( $elt, $path)= @_; return xp->findnodes_as_string(  $path, $elt); }
 110 sub findnodes_as_strings { my( $elt, $path)= @_; return xp->findnodes_as_strings( $path, $elt); }
 111 sub findvalue            { my( $elt, $path)= @_; return xp->findvalue(            $path, $elt); }
 112 sub exists               { my( $elt, $path)= @_; return xp->exists(               $path, $elt); }
 113 sub find_xpath           { my( $elt, $path)= @_; return xp->find(                 $path, $elt); }
 114 sub matches              { my( $elt, $path)= @_; return xp->matches( $elt, $path, $elt);        }
 115 sub set_namespace        { my $elt= shift; xp->new->set_namespace( @_); }
 116
 117 sub getRootNode
 118   { my $elt= shift;
 119     # The parent of root is a HTML::TreeBuilder::XPath::Root
 120     # that helps getting the tree to mimic a DOM tree
 121     return $elt->root->getParentNode; # I like this one!
 122   }
 123
 124 sub getParentNode
 125   { my $elt= shift;
 126     return $elt->{_parent} || bless { _root => $elt }, 'HTML::TreeBuilder::XPath::Root';
 127   }
 128 sub getName             { return shift->tag;   }
 129 sub getNextSibling      { my( $elt)= @_;
 130                           my $parent= $elt->{_parent} || return undef;
 131                           return  $parent->_child_as_object( scalar $elt->right, ($elt->{_rank} || 0) + 1);
 132                         }
 133 sub getPreviousSibling  { my( $elt)= @_;
 134                           my $parent= $elt->{_parent} || return undef;
 135                           return undef unless $elt->{_rank};
 136                           return  $parent->_child_as_object( scalar $elt->left, $elt->{_rank} - 1);
 137                         }
 138 sub isElementNode       { return ref $_[0] && ($_[0]->{_tag}!~ m{^~}) ? 1 : 0; }
 139 sub isCommentNode       { return ref $_[0] && ($_[0]->{_tag} eq '~comment') ? 1 : 0; }
 140 sub isProcessingInstructionNode { return ref $_[0] && ($_[0]->{_tag} eq '~pi') ? 1 : 0; }
 141 sub isTextNode          { return ref $_[0] ? 0 : 1; }
 142
 143 sub getValue
 144   { my $elt= shift;
 145     if( $elt->isCommentNode) { return $elt->{_text}; }
 146     return $elt->as_text;
 147   }
 148
 149 sub getChildNodes
 150   { my $parent= shift;
 151     my $rank=0;
 152     my @children= map { $parent->_child_as_object( $_, $rank++) } $parent->content_list;
 153     return wantarray ? @children : \@children;
 154   }
 155
 156 sub getFirstChild
 157   { my $parent= shift;
 158     my @content= $parent->content_list;
 159     if( @content)
 160       { return $parent->_child_as_object( $content[0], 0); }
 161     else
 162       { return undef; }
 163   }
 164 sub getLastChild
 165   { my $parent= shift;
 166     my @content= $parent->content_list;
 167     if( @content)
 168       { return $parent->_child_as_object( $content[-1], $#content); }
 169     else
 170       { return undef; }
 171   }
 172
 173 sub getAttributes
 174   { my $elt= shift;
 175     my %atts= $elt->all_external_attr;
 176     my $rank=0;
 177     my @atts= map { bless( { _name => $_, _value => $atts{$_},
 178                              _elt => $elt, _rank => $rank++,
 179                            },
 180                                'HTML::TreeBuilder::XPath::Attribute'
 181                          )
 182                   } sort keys %atts;
 183     return wantarray ? @atts : \@atts;
 184   }
 185
 186 sub to_number { return XML::XPathEngine::Number->new( $_[0]->as_text); }
 187 sub string_value
 188   { my $elt= shift;
 189     if( $elt->isCommentNode) { return $elt->{_text}; }
 190     return $elt->as_text;
 191   };
 192
 193 # called on a parent, with a child as second argument and its rank as third
 194 # returns the child if it is already an element, or
 195 # a new HTML::TreeBuilder::XPath::Text element if it is a plain string
 196 sub _child_as_object
 197   { my( $elt, $elt_or_text, $rank)= @_;
 198     return undef unless( defined $elt_or_text);
 199     if( ! ref $elt_or_text)
 200       { # $elt_or_text is a string, turn it into a TextNode object
 201         $elt_or_text= bless { _content => $elt_or_text, _parent => $elt, },
 202                             'HTML::TreeBuilder::XPath::TextNode'
 203                       ;
 204       }
 205     if( ref $rank) { warn "rank is a ", ref( $rank), " elt_or_text is a ", ref( $elt_or_text); }
 206     $elt_or_text->{_rank}= $rank; # used for sorting;
 207     return $elt_or_text;
 208   }
 209
 210 sub toString { return shift->as_XML_clean( @_); }
 211
 212 # produces better looking XML
 213 { my( $indent, %return_before_endtag);
 214   BEGIN
 215     { $indent= '  ';
 216       %return_before_endtag= map { $_ => 1 } qw(html head body script div table tr form ol ul);
 217     }
 218
 219   sub as_XML_clean
 220     { my( $node, $indent_level)= @_;
 221
 222       my $xml= '';
 223       my $wrapping_nl= "\n";
 224
 225       if( !defined( $indent_level)) { $indent_level = 0; $wrapping_nl= ''; }
 226
 227       my $name = $node->{'_tag'};
 228       if( $HTML::Tagset::isKnown{lc $name} && !$HTML::Tagset::isPhraseMarkup{lc $name} && $indent_level > 0)
 229         { $xml.= $wrapping_nl . ($indent x $indent_level); }
 230
 231       if(    $name eq '~literal')     { $xml= _xml_escape_text( $node->{text});                    }
 232       elsif( $name eq '~declaration') { $xml= '<!' . _xml_escape_text( $node->{text}) . '>';       }
 233       elsif( $name eq '~pi')          { $xml= '<?' . _xml_escape_text( $node->{text}) . '?>';      }
 234       elsif( $name eq '~comment')     { $xml= '<--' . _xml_escape_comment( $node->{text}) . '-->'; }
 235       elsif( $HTML::Tagset::isCDATA_Parent{lc $name})
 236         { $xml.= $node->_start_tag;
 237           my $content= $node->{_content} || '';
 238           if( ref $content eq 'ARRAY' || $content->isa( 'ARRAY'))
 239             { $xml .= _xml_escape_cdata( join( '', @$content)); }
 240           if( $return_before_endtag{lc $name}) { $xml.= "\n" . ($indent x $indent_level); }
 241         }
 242       else
 243         { # start tag
 244           $xml.= $node->_start_tag;
 245           my $child_indent_level= $HTML::Tagset{lc $name} ? $indent_level : $indent_level+1;
 246           foreach my $child ($node->content_list)
 247             { if( ref $child) { $xml .= $child->as_XML_clean( $child_indent_level); }
 248               else            { $xml .=  _xml_escape_text( $child); }
 249             }
 250           if( $return_before_endtag{lc $name}) { $xml.= "\n" . ($indent x $indent_level); }
 251         }
 252       $xml .="</$name>" unless $HTML::Tagset::emptyElement{lc $name};
 253       if( $indent_level == 0) { $xml .= $wrapping_nl; }
 254       return $xml;
 255     }
 256 }
 257
 258
 259 sub _start_tag
 260   { my( $node)= @_;
 261     my $name = $node->{'_tag'};
 262     my $start_tag.= "<$name";
 263     foreach my $att (sort keys %$node)
 264       { next if( (!length $att) ||  ($att=~ m{^_}) || ($att eq '/') );
 265         $start_tag .= qq{ $att="} . _xml_escape_attribute_value $node->{$att} . qq{"};
 266       }
 267     $start_tag.= $HTML::Tagset::emptyElement{lc $name} ? " />" : ">";
 268     return $start_tag;
 269   }
 270
 271 sub _indent_level
 272   { my( $node)= @_;
 273     my $level= scalar grep { !$HTML::Tagset::isPhraseMarkup{lc $_->{_tag}} } $node->lineage;
 274     return $level;
 275   }
 276
 277
 278 sub _xml_escape_attribute_value
 279   { my( $text)= @_;
 280     $text=~ s{([&<>"])}{$CHAR2DEFAULT_ENT{$1}}g; # escape also quote, as it is the attribute separator
 281     return $text;
 282   }
 283
 284 sub _xml_escape_text
 285   { my( $text)= @_;
 286     $text=~ s{([&<>])}{$CHAR2DEFAULT_ENT{$1}}g;
 287     return $text;
 288   }
 289
 290 sub _xml_escape_comment
 291   { my( $text)= @_;
 292     $text=~ s{([&<>])}{$CHAR2DEFAULT_ENT{$1}}g;
 293     $text=~ s{--}{-&#45;}g; # can't have double --'s in XML comments
 294     return $text;
 295   }
 296
 297 sub _xml_escape_cdata
 298   { my( $text)= @_;
 299     $text=~ s{^\s*\Q<![CDATA[}{}s;
 300     $text=~ s{\Q]]>\E\s*$}{}s;
 301     $text=~ s{]]>}{]]&#62;}g; # can't have]]> in CDATA
 302     $text=  "<![CDATA[$text]]>";
 303     return $text;
 304   }
 305
 306
 307 package HTML::TreeBuilder::XPath::TextNode;
 308
 309 use base 'HTML::TreeBuilder::XPath::Node';
 310
 311 sub getParentNode { return shift->{_parent};    }
 312 sub getValue      { return shift->{_content};   }
 313 sub isTextNode    { return 1;                   }
 314 sub getAttributes { return wantarray ? () : []; }
 315
 316 # similar to HTML::Element as_XML
 317 sub as_XML
 318   { my( $node, $entities)= @_;
 319     my $content= $node->{_content};
 320     if( $node->{_parent} && $node->{_parent}->{_tag} eq 'script')
 321       { $content=~ s{(&\w+;)}{HTML::Entities::decode($1)}eg; }
 322     else
 323       { HTML::Element::_xml_escape($content); }
 324     return $content;
 325   }
 326
 327 sub as_XML_clean
 328   { my( $node, $entities)= @_;
 329     my $content= $node->{_content};
 330     if( $node->{_parent} && $node->{_parent}->{_tag} eq 'script')
 331       { $content=~ s{(&\w+;)}{HTML::Entities::decode($1)}eg; }
 332     else
 333       { $content= HTML::Element::_xml_escape_text($content); }
 334     return $content;
 335   }
 336
 337 sub getPreviousSibling
 338   { my $self= shift;
 339     my $rank= $self->{_rank};
 340     #unless( defined $self->{_rank})
 341     #  { warn "no rank for text node $self->{_content}, parent is $self->{_parent}->{_tag}\n"; }
 342     my $parent= $self->{_parent};
 343     return $rank ? $parent->_child_as_object( $parent->{_content}->[$rank-1], $rank-1) : undef;
 344   }
 345
 346 sub getNextSibling
 347   { my $self= shift;
 348     my $rank= $self->{_rank};
 349     #unless( defined $self->{_rank})
 350     #  { warn "no rank for text node $self->{_content}, parent is $self->{_parent}->{_tag}\n"; }
 351     my $parent= $self->{_parent};
 352     my $next_sibling= $parent->{_content}->[$rank+1];
 353     return defined( $next_sibling) ? $parent->_child_as_object( $next_sibling, $rank+1) : undef;
 354   }
 355
 356 sub getRootNode
 357   { return shift->{_parent}->getRootNode; }
 358
 359 sub string_value { return shift->{_content}; }
 360
 361 # added to provide element-like methods to text nodes, for use by cmp
 362 sub lineage
 363   { my( $node)= @_;
 364     my $parent= $node->{_parent};
 365     return( $parent, $parent->lineage);
 366   }
 367
 368 sub is_inside
 369   { my( $text, $node)= @_;
 370     return $text->{_parent}->is_inside( $node);
 371   }
 372
 373 1;
 374
 375
 376 package HTML::TreeBuilder::XPath::Attribute;
 377 use base 'HTML::TreeBuilder::XPath::Node';
 378
 379 sub getParentNode   { return $_[0]->{_elt}; }
 380 sub getValue        { return $_[0]->{_value}; }
 381 sub getName         { return $_[0]->{_name} ; }
 382 sub getLocalName    { (my $name= $_[0]->{_name}) =~ s{^.*:}{}; $name; }
 383 sub string_value    { return $_[0]->{_value}; }
 384 sub to_number       { return XML::XPathEngine::Number->new( $_[0]->{_value}); }
 385 sub isAttributeNode { 1 }
 386 sub toString        { return qq{$_[0]->{_name}="$_[0]->{_value}"}; }
 387
 388 # awfully inefficient, but hopefully this is called only for weird (read test-case) queries
 389 sub getPreviousSibling
 390   { my $self= shift;
 391     my $rank= $self->{_rank};
 392     return undef unless $rank;
 393     my %atts= $self->{_elt}->all_external_attr;
 394     my $previous_att_name= (sort keys %atts)[$rank-1];
 395     return bless( { _name => $previous_att_name,
 396                              _value => $atts{$previous_att_name},
 397                              _elt => $self->{_elt}, _rank => $rank-1,
 398                    }, 'HTML::TreeBuilder::XPath::Attribute'
 399                 );
 400   }
 401
 402 sub getNextSibling
 403   { my $self= shift;
 404     my $rank= $self->{_rank};
 405     my %atts= $self->{_elt}->all_external_attr;
 406     my $next_att_name= (sort keys %atts)[$rank+1] || return undef;
 407     return bless( { _name => $next_att_name, _value => $atts{$next_att_name},
 408                              _elt => $self->{_elt}, _rank => $rank+1,
 409                    }, 'HTML::TreeBuilder::XPath::Attribute'
 410                 );
 411
 412   }
 413
 414
 415
 416 # added to provide element-like methods to attributes, for use by cmp
 417 sub lineage
 418   { my( $att)= @_;
 419     my $elt= $att->{_elt};
 420     return( $elt, $elt->lineage);
 421   }
 422
 423 sub is_inside
 424   { my( $att, $node)= @_;
 425     return ($att->{_elt} == $node) || $att->{_elt}->is_inside( $node);
 426   }
 427
 428 1;
 429
 430
 431 package HTML::TreeBuilder::XPath::Root;
 432
 433 use base 'HTML::TreeBuilder::XPath::Node';
 434
 435 sub getParentNode   { return (); }
 436 sub getChildNodes   { my @content= ( $_[0]->{_root}); return wantarray ? @content : \@content; }
 437 sub getAttributes   { return []        }
 438 sub isDocumentNode  { return 1         }
 439
 440 # added to provide element-like methods to root, for use by cmp
 441 sub lineage {  return ($_[0]); }
 442 sub is_inside { return 0; }
 443 sub cmp { return $_[1]->isa( ' HTML::TreeBuilder::XPath::Root') ? 0 : 1; }
 444
 445 1;
 446
 447 __END__
 448 =head1 NAME
 449
 450 HTML::TreeBuilder::XPath - add XPath support to HTML::TreeBuilder
 451
 452 =head1 SYNOPSIS
 453
 454   use HTML::TreeBuilder::XPath;
 455   my $tree= HTML::TreeBuilder::XPath->new;
 456   $tree->parse_file( "mypage.html");
 457   my $nb=$tree->findvalue( '/html/body//p[@class="section_title"]/span[@class="nb"]');
 458   my $id=$tree->findvalue( '/html/body//p[@class="section_title"]/@id');
 459
 460   my $p= $html->findnodes( '//p[@id="toto"]')->[0];
 461   my $link_texts= $p->findvalue( './a'); # the texts of all a elements in $p
 462
 463
 464 =head1 DESCRIPTION
 465
 466 This module adds typical XPath methods to HTML::TreeBuilder, to make it
 467 easy to query a document.
 468
 469 =head1 METHODS
 470
 471 Extra methods added both to the tree object and to each element:
 472
 473 =head2 findnodes ($path)
 474
 475 Returns a list of nodes found by C<$path>.
 476 In scalar context returns an C<Tree::XPathEngine::NodeSet> object.
 477
 478 =head2 findnodes_as_string ($path)
 479
 480 Returns the text values of the nodes, as one string.
 481
 482 =head2 findnodes_as_strings ($path)
 483
 484 Returns a list of the values of the result nodes.
 485
 486 =head2 findvalue ($path)
 487
 488 Returns either a C<Tree::XPathEngine::Literal>, a C<Tree::XPathEngine::Boolean>
 489 or a C<Tree::XPathEngine::Number> object. If the path returns a NodeSet,
 490 $nodeset->xpath_to_literal is called automatically for you (and thus a
 491 C<Tree::XPathEngine::Literal> is returned). Note that
 492 for each of the objects stringification is overloaded, so you can just
 493 print the value found, or manipulate it in the ways you would a normal
 494 perl value (e.g. using regular expressions).
 495
 496 =head2 exists ($path)
 497
 498 Returns true if the given path exists.
 499
 500 =head2 matches($path)
 501
 502 Returns true if the element matches the path.
 503
 504 =head2 find ($path)
 505
 506 The find function takes an XPath expression (a string) and returns either a
 507 Tree::XPathEngine::NodeSet object containing the nodes it found (or empty if
 508 no nodes matched the path), or one of XML::XPathEngine::Literal (a string),
 509 XML::XPathEngine::Number, or XML::XPathEngine::Boolean. It should always
 510 return something - and you can use ->isa() to find out what it returned. If
 511 you need to check how many nodes it found you should check $nodeset->size.
 512 See L<XML::XPathEngine::NodeSet>.
 513
 514 =head2 as_XML_clean ($optional_indent_level)
 515
 516 HTML::TreeBuilder's C<as_XML> output is not really nice to look at, so
 517 I added a new method, that can be used as a simple replacement for it.
 518 It escapes only the '<', '>' and '&' (plus '"' in attribute values), and
 519 wraps CDATA elements in CDATA sections.
 520
 521 The C<$optional_indent_level> defaults to the level in the original HTML
 522 document (ie you probably don't have to use it)
 523
 524 This method is currently in alpha state. Ping me if you want other options added
 525 to it (wrapping?).
 526
 527 =head1 SEE ALSO
 528
 529 L<HTML::TreeBuilder>
 530
 531 L<XML::XPathEngine>
 532
 533 =head1 AUTHOR
 534
 535 Michel Rodriguez, E<lt>mirod@cpan.orgE<gt>
 536
 537 =head1 COPYRIGHT AND LICENSE
 538
 539 Copyright (C) 2006 by Michel Rodriguez
 540
 541 This library is free software; you can redistribute it and/or modify
 542 it under the same terms as Perl itself, either Perl version 5.8.4 or,
 543 at your option, any later version of Perl 5 you may have available.
 544
 545
 546 =cut