Added interwiki test
[gpy.git] / lib / WikiLinkParser.pm
blobd2c4f4ea1c4182d2bae4001a9bc0e26873cc6b97
1 package WikiLinkParser;
3 use MediaWiki::API;
4 use strict;
5 use warnings;
6 use HTML::TreeBuilder 5 -weak;
7 use URI::Escape;
8 use Data::Dumper;
=head2 get_urls_by_text

Scans a piece of wiki text for internal links (C<[[...]]>) and template
transclusions (C<{{...}}>) and resolves each of them to a URL by calling
C<_parse>.

=head3 Input

 - a string of the wiki text, $text
 - a mediawiki api object, $mw

=head3 Output

A reference to an array holding the URLs of all links, followed by the
URLs of all templates, in the order they appear in the text.

=cut

sub get_urls_by_text {
    my ($self, $text, $mw) = @_;
    my @urls = ();

    # Nothing to scan: hand back an empty list instead of warning on undef.
    return \@urls unless defined $text;

    # Internal links. Only the link target is captured: a piped label
    # ("[[Target|label]]") must not be passed on, because it would end up as
    # an extra argument of the {{fullurl:...}} call built by _parse.
    # The lookahead leaves the label unconsumed, so links nested inside a
    # label (e.g. image captions) are still found by later iterations.
    while ($text =~ m{\[\[([^\[\]|\n]+)(?=\||\]\])}g) {
        push @urls, $self->_parse($1, $mw);
    }

    # Template transclusions. The name ends at the first "|" (start of the
    # arguments) or "}" (end of the transclusion). The braces are escaped
    # because an unescaped literal "{" inside a pattern is deprecated or
    # rejected by newer perls in some positions.
    while ($text =~ m{\{\{(.*?)[|}]}g) {
        push @urls, $self->_parse("Template:$1", $mw);
    }

    return \@urls;
}
# Percent-decode a URI string, then interpret the resulting octets as UTF-8.
# Returns the decoded string.
sub uri_unescape_utf8 {
    my ($encoded) = @_;

    my $decoded = uri_unescape($encoded);

    # utf8::decode works in place; its success flag is deliberately ignored,
    # so the string is returned whether or not the decode succeeded.
    utf8::decode($decoded);

    return $decoded;
}
# Resolve a wiki page title to its URL.
#
# Asks the wiki (through the MediaWiki API "parse" action) to render
# "{{fullurl:TITLE}}", pulls the rendered URL out of the first <p> element of
# the returned HTML and returns it percent-decoded as a character string.
#
# Input:
#   $text - page title (may carry a namespace prefix such as "Template:")
#   $mw   - MediaWiki::API object used to talk to the wiki
#
# Returns the URL as a string.
# Dies when the API call fails or the response contains no <p> element.
sub _parse {
    my ($self, $text, $mw) = @_;

    my $info_ref = $mw->api( {
        action => 'parse',
        prop   => 'text',
        text   => "{{fullurl:$text}}",
    } ) or die $mw->{error}->{code} . ': ' . $mw->{error}->{details};

    my $html = $info_ref->{parse}{text}{'*'};
    $html = '' unless defined $html;

    # parse html
    my $tree      = HTML::TreeBuilder->new_from_content($html);
    my $paragraph = $tree->look_down('_tag', 'p')
        or die "no <p> element in the parse result for '$text'";

    # The URL is expected to be the first child of the paragraph. If the wiki
    # wrapped it in markup, that child is an element rather than plain text,
    # so take the element's text instead of stringifying the object.
    my $raw = $paragraph->content_array_ref->[0];
    $raw = ''            unless defined $raw;
    $raw = $raw->as_text if ref $raw;

    # fullurl yields a protocol-relative URL ("//host/path") on wikis that are
    # configured that way, and that form needs a scheme. A URL that already
    # carries its own scheme must not get a second one prepended.
    my $has_scheme = $raw !~ m{\A//} && $raw =~ m{\A[A-Za-z][A-Za-z0-9+.\-]*:};
    my $url = $has_scheme ? $raw : 'https:' . $raw;

    return uri_unescape_utf8($url);
}