1 package CPAN
::Testers
::ParseReport
;
6 use DateTime
::Format
::Strptime
;
7 use File
::Basename
qw(basename);
8 use File
::Path
qw(mkpath);
9 use HTML
::Entities
qw(decode_entities);
11 use List
::Util
qw(max sum);
13 use XML
::LibXML
::XPathContext
;
15 our $default_ctformat = "html";
16 our $default_cturl = "http://www.cpantesters.org/show";
23 CPAN::Testers::ParseReport - parse reports to www.cpantesters.org from various sources
27 use version
; our $VERSION = qv
('0.0.13');
31 The documentation in here is normally not needed because the code is
32 meant to be run from a standalone program, L<ctgetreports>.
34 ctgetreports --q mod:Moose Devel-Events
38 This is the core module for CPAN::Testers::ParseReport. If you're not
39 looking to extend or alter the behaviour of this module, you probably
40 want to look at L<ctgetreports> instead.
44 Are described in the L<ctgetreports> manpage and are passed through to
45 the functions unaltered.
49 =head2 parse_distro($distro,$options)
51 reads the cpantesters HTML page or the YAML file for the distro and
52 loops through the reports for the specified or most recent version of
53 that distro found in these data.
55 =head2 parse_single_report($report,$dumpvars,$options)
57 mirrors and reads this report. $report is of the for
61 $dumpvar is a hashreference that gets filled with data.
69 $ua = LWP
::UserAgent
->new
82 $xp = XML
::LibXML
->new;
84 $xp->clean_namespaces(1);
85 my $catalog = __FILE__
;
86 $catalog =~ s
|ParseReport
.pm
$|ParseReport
/catalog
|;
87 $xp->load_catalog($catalog);
92 sub _download_overview
{
93 my($cts_dir, $distro, %Opt) = @_;
94 my $format = $Opt{ctformat
} ||= $default_ctformat;
95 my $cturl = $Opt{cturl
} ||= $default_cturl;
96 my $ctarget = "$cts_dir/$distro.$format";
97 my $cheaders = "$cts_dir/$distro.headers";
99 unless (-e
$ctarget) {
100 die "Alert: No local file '$ctarget' found, cannot continue\n";
103 if (! -e
$ctarget or -M
$ctarget > .25) {
104 if (-e
$ctarget && $Opt{verbose
}) {
106 my $timestamp = gmtime $stat[9];
107 print STDERR
"(timestamp $timestamp GMT)\n" unless $Opt{quiet
};
109 print STDERR
"Fetching $ctarget..." if $Opt{verbose
} && !$Opt{quiet
};
110 my $uri = "$cturl/$distro.$format";
111 my $resp = _ua
->mirror($uri,$ctarget);
112 if ($resp->is_success) {
113 print STDERR
"DONE\n" if $Opt{verbose
} && !$Opt{quiet
};
114 open my $fh, ">", $cheaders or die;
115 for ($resp->headers->as_string) {
117 if ($Opt{verbose
} && $Opt{verbose
}>1) {
118 print STDERR
$_ unless $Opt{quiet
};
121 } elsif (304 == $resp->code) {
122 print STDERR
"DONE (not modified)\n" if $Opt{verbose
} && !$Opt{quiet
};
123 my $atime = my $mtime = time;
124 utime $atime, $mtime, $cheaders;
128 "No success downloading %s: %s",
139 my($ctarget, %Opt) = @_;
140 my $content = do { open my $fh, $ctarget or die; local $/; <$fh> };
141 my $preprocesswithtreebuilder = 0; # not needed since barbie switched to XHTML
142 if ($preprocesswithtreebuilder) {
143 require HTML
::TreeBuilder
;
144 my $tree = HTML
::TreeBuilder
->new;
145 $tree->implicit_tags(1);
147 $tree->ignore_ignorable_whitespace(0);
148 $tree->parse_content($content);
150 $content = $tree->as_XML;
153 my $doc = eval { $parser->parse_string($content) };
156 my $distro = basename
$ctarget;
157 die sprintf "Error while parsing %s\: %s", $distro, $err;
159 my $xc = XML
::LibXML
::XPathContext
->new($doc);
160 my $nsu = $doc->documentElement->namespaceURI;
161 $xc->registerNs('x', $nsu) if $nsu;
162 my($selected_release_ul,$selected_release_distrov,$excuse_string);
165 $xc->findnodes("/x:html/x:body/x:div[\@id = 'doc']") :
166 $doc->findnodes("/html/body/div[\@id = 'doc']");
167 my(@releasedivs) = $nsu ?
168 $xc->findnodes("//x:div[x:h2 and x:ul]",$cparentdiv) :
169 $cparentdiv->findnodes("//div[h2 and ul]");
172 $excuse_string = "selected distro '$Opt{vdistro}'";
173 my($fallbacktoversion) = $Opt{vdistro
} =~ /(\d+\..*)/;
174 RELEASE
: for my $i (0..$#releasedivs) {
177 $xc->findvalue("x:h2/x:a[2]/\@name",$releasedivs[$i]) :
178 $releasedivs[$i]->findvalue("h2/a[2]/\@name");
181 if ($x eq $Opt{vdistro
}) {
183 $picked = " (picked)";
185 print STDERR
"FOUND DISTRO: $x$picked\n" unless $Opt{quiet
};
188 $xc->findvalue("x:h2/x:a[1]/\@name",$releasedivs[$i]) :
189 $releasedivs[$i]->findvalue("h2/a[1]/\@name");
190 if ($x eq $fallbacktoversion) {
192 $picked = " (picked)";
194 print STDERR
"FOUND VERSION: $x$picked\n" unless $Opt{quiet
};
198 $excuse_string = "any distro";
200 unless (defined $releasediv) {
204 # using a[1] because a[2] is missing on the first entry
205 ($selected_release_distrov) = $nsu ?
206 $xc->findvalue("x:h2/x:a[1]/\@name",$releasedivs[$releasediv]) :
207 $releasedivs[$releasediv]->findvalue("h2/a[1]/\@name");
208 ($selected_release_ul) = $nsu ?
209 $xc->findnodes("x:ul",$releasedivs[$releasediv]) :
210 $releasedivs[$releasediv]->findnodes("ul");
211 unless (defined $selected_release_distrov) {
212 warn "Warning: could not find $excuse_string in '$ctarget'";
215 print STDERR
"SELECTED: $selected_release_distrov\n" unless $Opt{quiet
};
219 $xc->findnodes("x:li",$selected_release_ul) :
220 $selected_release_ul->findnodes("li")) {
222 $xc->findvalue("x:a[1]/text()",$test) :
223 $test->findvalue("a[1]/text()");
224 push @all, {id
=>$id};
231 my($ctarget, %Opt) = @_;
233 my $arr = YAML
::Syck
::LoadFile
($ctarget);
234 my($selected_release_ul,$selected_release_distrov,$excuse_string);
236 $excuse_string = "selected distro '$Opt{vdistro}'";
237 $arr = [grep {$_->{distversion
} eq $Opt{vdistro
}} @
$arr];
238 ($selected_release_distrov) = $arr->[0]{distversion
};
240 $excuse_string = "any distro";
243 for my $report (@
$arr) {
244 unless ($seen{$report->{distversion
}}++) {
245 $last_addition = $report->{distversion
};
248 $arr = [grep {$_->{distversion
} eq $last_addition} @
$arr];
249 ($selected_release_distrov) = $last_addition;
251 unless ($selected_release_distrov) {
252 warn "Warning: could not find $excuse_string in '$ctarget'";
255 print STDERR
"SELECTED: $selected_release_distrov\n" unless $Opt{quiet
};
257 for my $test (@
$arr) {
258 my $id = $test->{id
};
259 push @all, {id
=>$id};
262 @all = sort { $b->{id
} <=> $a->{id
} } @all;
266 sub parse_single_report
{
267 my($report, $dumpvars, %Opt) = @_;
268 my($id) = $report->{id
};
269 $Opt{cachedir
} ||= "$ENV{HOME}/var/cpantesters";
270 my $nnt_dir = "$Opt{cachedir}/nntp-testers";
272 my $target = "$nnt_dir/$id";
274 unless (-e
$target) {
275 warn "Warning: No local file '$target' found, skipping\n";
280 print STDERR
"Fetching $target..." if $Opt{verbose
} && !$Opt{quiet
};
281 my $resp = _ua
->mirror("http://www.nntp.perl.org/group/perl.cpan.testers/$id",$target);
282 if ($resp->is_success) {
284 my(@stat) = stat $target;
285 my $timestamp = gmtime $stat[9];
286 print STDERR
"(timestamp $timestamp GMT)\n" unless $Opt{quiet
};
287 if ($Opt{verbose
} > 1) {
288 print STDERR
$resp->headers->as_string unless $Opt{quiet
};
291 my $headers = "$target.headers";
292 open my $fh, ">", $headers or die;
293 print $fh $resp->headers->as_string;
295 die $resp->status_line;
299 parse_report
($target, $dumpvars, %Opt);
303 my($distro,%Opt) = @_;
305 $Opt{cachedir
} ||= "$ENV{HOME}/var/cpantesters";
306 my $cts_dir = "$Opt{cachedir}/cpantesters-show";
309 require Statistics
::Regression
;
310 $Opt{dumpvars
} = "." unless defined $Opt{dumpvars
};
311 $Opt{quiet
} = 1 unless defined $Opt{quiet
};
313 my $ctarget = _download_overview
($cts_dir, $distro, %Opt);
315 $Opt{ctformat
} ||= $default_ctformat;
316 if ($Opt{ctformat
} eq "html") {
317 $reports = _parse_html
($ctarget,%Opt);
319 $reports = _parse_yaml
($ctarget,%Opt);
321 return unless $reports;
322 for my $report (@
$reports) {
323 parse_single_report
($report, \
%dumpvars, %Opt);
326 if ($Opt{dumpvars
}) {
328 my $dumpfile = $Opt{dumpfile
} || "ctgetreports.out";
329 open my $fh, ">", $dumpfile or die "Could not open '$dumpfile' for writing: $!";
330 print $fh YAML
::Syck
::Dump
(\
%dumpvars);
331 close $fh or die "Could not close '$dumpfile': $!"
338 =head2 parse_report($target,$dumpvars,%Opt)
340 Reads one report. $target is the local filename to read. $dumpvars is
341 a hashref which gets filled. %Opt are the options as described in the
342 C<ctgetreports> manpage.
344 Note: this parsing is a bit dirty but as it seems good enough I'm not
345 inclined to change it. We parse HTML with a regexps only, no HTML
346 parser working, only the entities are decoded.
350 my($target,$dumpvars,%Opt) = @_;
352 my $id = basename
($target);
358 my @qr = map /^qr:(.+)/, @
{$Opt{q
}};
359 if ($Opt{raw
} || @qr) {
360 open my $fh, $target or die "Could not open '$target': $!";
362 $report = decode_entities
<$fh>;
365 my $cqr = eval "qr{$qr}";
366 die "Could not compile regular expression '$qr': $@" if $@
;
367 my(@matches) = $report =~ $cqr;
373 $v = join "", map {"($_)"} @matches;
378 $extract{"qr:$qr"} = $v;
382 open my $fh, $target or die "Could not open '$target': $!";
385 my $moduleunpack = {};
386 my $expect_prereq = 0;
387 my $expect_toolchain = 0;
388 my $expecting_toolchain_soon = 0;
391 my $in_prg_output = 0;
392 my $in_env_context = 0;
394 my $current_headline;
395 my @previous_line = ""; # so we can neutralize line breaks
396 LINE
: while (<$fh>) {
397 next unless /<title>(\S+)\s+(\S+)/;
400 $extract{"meta:ok"} = $ok;
401 $extract{"meta:about"} = $about;
405 LINE
: while (<$fh>) {
408 my $followupline = <$fh>;
409 $followupline =~ s/^\s+//; # remo leading space
413 $_ = decode_entities
$_;
414 if (/^--------/ && $previous_line[-2] && $previous_line[-2] =~ /^--------/) {
415 $current_headline = $previous_line[-1];
416 if ($current_headline =~ /PROGRAM OUTPUT/) {
421 if ($current_headline =~ /ENVIRONMENT AND OTHER CONTEXT/) {
427 unless ($extract{"meta:perl"}) {
430 } elsif (/Summary of my perl5 \((.+)\) configuration:/) {
437 if (($r,$v,$s,$p) = $p5 =~ /revision (\S+) version (\S+) subversion (\S+) patch (\S+)/) {
438 $r =~ s/\.0//; # 5.0 6 2!
439 $extract{"meta:perl"} = "$r.$v.$s\@$p";
440 } elsif (($r,$v,$s) = $p5 =~ /revision (\S+) version (\S+) subversion (\S+)/) {
442 $extract{"meta:perl"} = "$r.$v.$s";
443 } elsif (($r,$v,$s) = $p5 =~ /(\d+\S*) patchlevel (\S+) subversion (\S+)/) {
445 $extract{"meta:perl"} = "$r.$v.$s";
447 $extract{"meta:perl"} = $p5;
451 unless ($extract{"meta:from"}) {
453 } elsif (m
|<div
class="h_name">From
:</div> <b>(.+?)</b
><br
/>|) {
454 $extract{"meta:from"} = $1;
456 $extract{"meta:from"} =~ s/\.$// if $extract{"meta:from"};
458 unless ($extract{"meta:date"}) {
460 } elsif (m
|<div
class="h_name">Date
:</div> (.+?)<br/>|) {
462 my $p = DateTime
::Format
::Strptime
->new(
465 # April 13, 2005 23:50
466 pattern
=> "%b %d, %Y %R",
468 my $dt = $p->parse_datetime($date);
469 $extract{"meta:date"} = $dt->datetime;
471 $extract{"meta:date"} =~ s/\.$// if $extract{"meta:date"};
473 unless ($extract{"meta:writer"}) {
474 for ("$previous_line[-1] $_") {
476 } elsif (/created (?:automatically )?by (\S+)/) {
477 $extract{"meta:writer"} = $1;
478 } elsif (/CPANPLUS, version (\S+)/) {
479 $extract{"meta:writer"} = "CPANPLUS $1";
480 } elsif (/This report was machine-generated by (\S+) (\S+)/) {
481 $extract{"meta:writer"} = "$1 $2";
483 $extract{"meta:writer"} =~ s/[\.,]$// if $extract{"meta:writer"};
487 # we do that first three lines a bit too often
488 my $qr = $Opt{dumpvars
} || "";
489 $qr = qr/$qr/ if $qr;
492 @q = qw(meta:perl conf:archname conf:usethreads conf:optimize meta:writer meta:from) unless @q;
495 my %conf_vars = map {($_ => 1)} grep { /^conf:/ } @q;
497 if (/^\s*$/ || m
|</pre
>|) {
500 my(%kv) = /\G,?\s*([^=]+)=('[^']+?'|\S+)/gc;
501 while (my($k,$v) = each %kv) {
504 if ($v =~ /^'(.*)'$/) {
509 if ($qr && $ck =~ $qr) {
510 $dumpvars->{$ck}{$v}{$ok}++;
512 } elsif ($conf_vars{$ck}) {
518 if ($in_prg_output) {
519 unless ($extract{"meta:output_from"}) {
520 if (/Output from (.+):$/) {
521 $extract{"meta:output_from"} = $1
525 if ($in_env_context) {
526 if (/^\s{4}(\S+)\s*=\s*(.*)$/) {
527 $extract{"env:$1"} = $2;
530 push @previous_line, $_;
531 if ($expect_prereq || $expect_toolchain) {
532 if (exists $moduleunpack->{type
}) {
534 if ($moduleunpack->{type
} == 1) {
535 (my $leader,$module,undef,$v) = eval { unpack $moduleunpack->{tpl
}, $_; };
537 if ($leader =~ /^-/) {
541 } elsif ($leader =~ /^(
542 buil
# build_requires:
545 } elsif ($module =~ /^(
550 } elsif ($moduleunpack->{type
} == 2) {
551 (my $leader,$module,$v) = eval { unpack $moduleunpack->{tpl
}, $_; };
553 if ($leader =~ /^\*/) {
558 } elsif ($moduleunpack->{type
} == 3) {
559 (my $leader,$module,$v) = eval { unpack $moduleunpack->{tpl
}, $_; };
563 $expect_toolchain = 0;
565 } elsif ($module =~ /^-/) {
573 $extract{"mod:$module"} = $v;
576 if (/(\s+)(Module\s+)(Need\s+)Have/) {
579 tpl
=> 'a'.length($1).'a'.length($2).'a'.length($3).'a*',
582 } elsif (/(\s+)(Module Name\s+)(Have)(\s+)Want/) {
585 my $adjust_2 = -length($4);
586 my $adjust_3 = length($4);
587 # two pass would be required to see where the
588 # columns really are. Or could we get away with split?
590 tpl
=> 'a'.length($1).'a'.(length($2)+$adjust_2).'a'.(length($3)+$adjust_3),
595 if (/PREREQUISITES|Prerequisite modules loaded/) {
599 if ($expecting_toolchain_soon) {
600 if (/(\s+)(Module\s+) Have/) {
603 $expecting_toolchain_soon=0;
605 tpl
=> 'a'.length($1).'a'.length($2).'a*',
610 if (/toolchain versions installed/) {
612 $expecting_toolchain_soon=1;
616 if (my $qr = $Opt{dumpvars
}) {
618 while (my($k,$v) = each %extract) {
620 $dumpvars->{$k}{$v}{$ok}++;
625 my $have = $extract{$want} || "";
626 $diag .= " $want\[$have]";
630 my $data = $dumpvars->{"==DATA=="} ||= [];
631 push @
$data, \
%extract;
633 printf STDERR
" %-4s %8d%s\n", $ok, $id, $diag unless $Opt{quiet
};
635 $report =~ s/\s+\z//;
636 print STDERR
$report, "\n================\n" unless $Opt{quiet
};
638 if ($Opt{interactive
}) {
642 my $ans = IO
::Prompt
::prompt
644 -p
=> "View $id? [onechar: ynq] ",
649 print STDERR
"\n" unless $Opt{quiet
};
651 open my $ifh, "<", $target or die "Could not open $target: $!";
652 $Opt{pager
} ||= "less";
653 open my $lfh, "|-", $Opt{pager
} or die "Could not fork '$Opt{pager}': $!";
656 close $ifh or die "Could not close $target: $!";
657 close $lfh or die "Could not close pager: $!"
658 } elsif ($ans eq "q") {
673 require Statistics
::Regression
;
675 for my $variable (sort keys %$V) {
676 next if $variable eq "==DATA==";
677 my $value_distribution = $V->{$variable};
678 my $keys = keys %$value_distribution;
681 for my $value (sort keys %$value_distribution) {
682 my $pf = $value_distribution->{$value};
685 my $provers = sum
map {$pf->{$_}} qw(PASS FAIL);
686 push @results, $provers;
687 push @X, "eq_$value";
689 $pf->{PASS
} xor $pf->{FAIL
}
692 substr($value,$vl) = "..." if length $value > 3+$vl;
693 my $poor_mans_freehand_estimation = 0;
694 if ($poor_mans_freehand_estimation) {
697 "%4d %4d %-23s | %s\n",
706 my $results = max
@results;
708 warn "variable[$variable]keys[$keys]results[$results]\n";
709 my $reg = Statistics
::Regression
->new("reg_$variable",\
@X);
710 RECORD
: for my $rec (@
{$V->{"==DATA=="}}) {
712 if ($rec->{"meta:ok"} eq "PASS") {
714 } elsif ($rec->{"meta:ok"} eq "FAIL") {
723 next unless $x =~ /^eq_(.+)/;
725 if (exists $rec->{$variable} && defined $rec->{$variable} && $rec->{$variable} eq $v) {
729 $reg->include($y, \
%obs);
733 # irrelevant observation or something that needs further
734 # tweaking, like date
746 Please report any bugs or feature requests through the web
748 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=CPAN-Testers-ParseReport>.
749 I will be notified, and then you'll automatically be notified of
750 progress on your bug as I make changes.
754 You can find documentation for this module with the perldoc command.
756 perldoc CPAN::Testers::ParseReport
759 You can also look for information at:
763 =item * RT: CPAN's request tracker
765 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=CPAN-Testers-ParseReport>
767 =item * AnnoCPAN: Annotated CPAN documentation
769 L<http://annocpan.org/dist/CPAN-Testers-ParseReport>
773 L<http://cpanratings.perl.org/d/CPAN-Testers-ParseReport>
777 L<http://search.cpan.org/dist/CPAN-Testers-ParseReport>
782 =head1 ACKNOWLEDGEMENTS
784 Thanks to RJBS for module-starter.
786 =head1 COPYRIGHT & LICENSE
788 Copyright 2008 Andreas König.
790 This program is free software; you can redistribute it and/or modify it
791 under the same terms as Perl itself.
796 1; # End of CPAN::Testers::ParseReport