lib/WikiLinkParser.pm

   1 package WikiLinkParser;
   2
   3 use MediaWiki::API;
   4 use strict;
   5 use warnings;
   6 use HTML::TreeBuilder 5 -weak;
   7 use URI::Escape;
   8 use Data::Dumper;
   9
   10 =head2 get_urls_by_text
  11
  12 =head3 Input
  13
  14 - a string of the wiki text, $wiki_text
  15 - a mediawiki api object, $mw
  16
  17 =cut
  18
  19 sub get_urls_by_text{
  20     my $self = shift;
  21     my $text = shift;
  22     my $mw = shift;
  23     my @urls = ();
  24     # wiki text to html
  25     while ($text =~ m{\[\[(.*?)\]\]}g) {
  26         push @urls, $self->_parse($1,$mw);
  27     }
  28     while ($text =~ m<{{(.*?)[\||}}]>g) {
  29         push @urls, $self->_parse("Template:$1",$mw);
  30     }
  31     return \@urls;
  32 }
  33
  34 sub uri_unescape_utf8 { my ($str) = @_; $str = uri_unescape $str; utf8::decode $str; $str }
  35
  36 sub _parse{
  37     my $self = shift;
  38     my $text = shift;
  39     my $mw = shift;
  40     my $info_ref = $mw->api ( {
  41         action      => 'parse',
  42         prop        => 'text',
  43         text        => "{{fullurl:$text}}",
  44     } ) or die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
  45     my $html = $info_ref->{parse}{text}{'*'};
  46     # parse html
  47     my $tree = HTML::TreeBuilder->new_from_content($html);
  48     my $url = uri_unescape_utf8 ('https:'.$tree->look_down('_tag','p')->content_array_ref->[0]);
  49     return $url;
  50 }
  51
  52 1;
  53