add compression support for fetched and cached cpan testers reports
[cpan-testers-parsereport.git] / lib / CPAN / Testers / ParseReport.pm
blob2961a9ee8d853a5aaea57546d84eaec0517fa4d6
1 package CPAN::Testers::ParseReport;
3 use warnings;
4 use strict;
6 use Config::Perl::V ();
7 use DateTime::Format::Strptime;
8 use File::Basename qw(basename);
9 use File::Path qw(mkpath);
10 use HTML::Entities qw(decode_entities);
11 use LWP::UserAgent;
12 use List::Util qw(max min sum);
13 use MIME::QuotedPrint ();
14 use Time::Local ();
15 use XML::LibXML;
16 use XML::LibXML::XPathContext;
18 our $default_ctformat = "yaml";
19 our $default_transport = "http_cpantesters";
20 our $default_cturl = "http://www.cpantesters.org/show";
21 our $Signal = 0;
23 =encoding utf-8
25 =head1 NAME
27 CPAN::Testers::ParseReport - parse reports to www.cpantesters.org from various sources
29 =cut
31 use version; our $VERSION = qv('0.1.15');
33 =head1 SYNOPSIS
35 The documentation in here is normally not needed because the code is
36 meant to be run from the standalone program C<ctgetreports>.
38 ctgetreports --q mod:Moose Devel-Events
40 =head1 DESCRIPTION
42 This is the core module for CPAN::Testers::ParseReport. If you're not
43 looking to extend or alter the behaviour of this module, you probably
44 want to look at L<ctgetreports> instead.
46 =head1 OPTIONS
48 Options are described in the L<ctgetreports> manpage and are passed
49 through to the functions unaltered.
51 =head1 FUNCTIONS
53 =head2 parse_distro($distro,%options)
55 reads the cpantesters HTML page or the YAML file or the local database
56 for the distro and loops through the reports for the specified or most
57 recent version of that distro found in these data.
59 parse_distro() intentionally has no meaningful return value, different
60 options would require different ones.
62 =head2 $extract = parse_single_report($report,$dumpvars,%options)
64 mirrors and reads this report. $report is of the form
66 { id => number }
68 $dumpvar is a hashreference that gets filled with data.
70 $extract is the result of parse_report() described below.
72 =cut
75 my $ua;
76 sub _ua {
77 return $ua if $ua;
78 $ua = LWP::UserAgent->new
80 keep_alive => 1,
81 env_proxy => 1,
83 $ua->parse_head(0);
85 # I would love to support gzipped transfer but it doesn't seem
86 # to mix well with mirroring:
88 # $ua->default_header('Accept-Encoding' => scalar HTTP::Message::decodable());
90 $ua;
95 my $xp;
96 sub _xp {
97 return $xp if $xp;
98 $xp = XML::LibXML->new;
99 $xp->keep_blanks(0);
100 $xp->clean_namespaces(1);
101 my $catalog = __FILE__;
102 $catalog =~ s|ParseReport.pm$|ParseReport/catalog|;
103 $xp->load_catalog($catalog);
104 return $xp;
108 sub _download_overview {
109 my($cts_dir, $distro, %Opt) = @_;
110 my $format = $Opt{ctformat} ||= $default_ctformat;
111 my $cturl = $Opt{cturl} ||= $default_cturl;
112 my $ctarget = "$cts_dir/$distro.$format";
113 my $cheaders = "$cts_dir/$distro.headers";
114 if ($Opt{local}) {
115 unless (-e $ctarget) {
116 die "Alert: No local file '$ctarget' found, cannot continue\n";
118 } else {
119 if (! -e $ctarget or -M $ctarget > .25) {
120 if (-e $ctarget && $Opt{verbose}) {
121 my(@stat) = stat _;
122 my $timestamp = gmtime $stat[9];
123 print STDERR "(timestamp $timestamp GMT)\n" unless $Opt{quiet};
125 print STDERR "Fetching $ctarget..." if $Opt{verbose} && !$Opt{quiet};
126 my $uri = "$cturl/$distro.$format";
127 my $resp = _ua->mirror($uri,$ctarget);
128 if ($resp->is_success) {
129 print STDERR "DONE\n" if $Opt{verbose} && !$Opt{quiet};
130 open my $fh, ">", $cheaders or die;
131 for ($resp->headers->as_string) {
132 print $fh $_;
133 if ($Opt{verbose} && $Opt{verbose}>1) {
134 print STDERR $_ unless $Opt{quiet};
137 } elsif (304 == $resp->code) {
138 print STDERR "DONE (not modified)\n" if $Opt{verbose} && !$Opt{quiet};
139 my $atime = my $mtime = time;
140 utime $atime, $mtime, $cheaders;
141 } else {
142 die sprintf
144 "No success downloading %s: %s",
145 $uri,
146 $resp->status_line,
151 return $ctarget;
154 sub _parse_html {
155 my($ctarget, %Opt) = @_;
156 my $content = do { open my $fh, $ctarget or die; local $/; <$fh> };
157 my $preprocesswithtreebuilder = 0; # not needed since barbie switched to XHTML
158 if ($preprocesswithtreebuilder) {
159 require HTML::TreeBuilder;
160 my $tree = HTML::TreeBuilder->new;
161 $tree->implicit_tags(1);
162 $tree->p_strict(1);
163 $tree->ignore_ignorable_whitespace(0);
164 $tree->parse_content($content);
165 $tree->eof;
166 $content = $tree->as_XML;
168 my $parser = _xp();
169 my $doc = eval { $parser->parse_string($content) };
170 my $err = $@;
171 unless ($doc) {
172 my $distro = basename $ctarget;
173 die sprintf "Error while parsing %s\: %s", $distro, $err;
175 my $xc = XML::LibXML::XPathContext->new($doc);
176 my $nsu = $doc->documentElement->namespaceURI;
177 $xc->registerNs('x', $nsu) if $nsu;
178 my($selected_release_ul,$selected_release_distrov,$excuse_string);
179 my($cparentdiv)
180 = $nsu ?
181 $xc->findnodes("/x:html/x:body/x:div[\@id = 'doc']") :
182 $doc->findnodes("/html/body/div[\@id = 'doc']");
183 my(@releasedivs) = $nsu ?
184 $xc->findnodes("//x:div[x:h2 and x:ul]",$cparentdiv) :
185 $cparentdiv->findnodes("//div[h2 and ul]");
186 my $releasediv;
187 if ($Opt{vdistro}) {
188 $excuse_string = "selected distro '$Opt{vdistro}'";
189 my($fallbacktoversion) = $Opt{vdistro} =~ /(\d+\..*)/;
190 $fallbacktoversion = 0 unless defined $fallbacktoversion;
191 RELEASE: for my $i (0..$#releasedivs) {
192 my $picked = "";
193 my($x) = $nsu ?
194 $xc->findvalue("x:h2/x:a[2]/\@name",$releasedivs[$i]) :
195 $releasedivs[$i]->findvalue("h2/a[2]/\@name");
196 if ($x) {
197 if ($x eq $Opt{vdistro}) {
198 $releasediv = $i;
199 $picked = " (picked)";
201 print STDERR "FOUND DISTRO: $x$picked\n" unless $Opt{quiet};
202 } else {
203 ($x) = $nsu ?
204 $xc->findvalue("x:h2/x:a[1]/\@name",$releasedivs[$i]) :
205 $releasedivs[$i]->findvalue("h2/a[1]/\@name");
206 if ($x eq $fallbacktoversion) {
207 $releasediv = $i;
208 $picked = " (picked)";
210 print STDERR "FOUND VERSION: $x$picked\n" unless $Opt{quiet};
213 } else {
214 $excuse_string = "any distro";
216 unless (defined $releasediv) {
217 $releasediv = 0;
219 # using a[1] because a[2] is missing on the first entry
220 ($selected_release_distrov) = $nsu ?
221 $xc->findvalue("x:h2/x:a[1]/\@name",$releasedivs[$releasediv]) :
222 $releasedivs[$releasediv]->findvalue("h2/a[1]/\@name");
223 ($selected_release_ul) = $nsu ?
224 $xc->findnodes("x:ul",$releasedivs[$releasediv]) :
225 $releasedivs[$releasediv]->findnodes("ul");
226 unless (defined $selected_release_distrov) {
227 warn "Warning: could not find $excuse_string in '$ctarget'";
228 return;
230 print STDERR "SELECTED: $selected_release_distrov\n" unless $Opt{quiet};
231 my($id);
232 my @all;
233 for my $test ($nsu ?
234 $xc->findnodes("x:li",$selected_release_ul) :
235 $selected_release_ul->findnodes("li")) {
236 $id = $nsu ?
237 $xc->findvalue("x:a[1]/text()",$test) :
238 $test->findvalue("a[1]/text()");
239 push @all, {id=>$id};
240 return if $Signal;
242 return \@all;
245 sub _parse_yaml {
246 my($ctarget, %Opt) = @_;
247 require YAML::Syck;
248 my $arr = YAML::Syck::LoadFile($ctarget);
249 my($selected_release_ul,$selected_release_distrov,$excuse_string);
250 if ($Opt{vdistro}) {
251 $excuse_string = "selected distro '$Opt{vdistro}'";
252 $arr = [grep {$_->{distversion} eq $Opt{vdistro}} @$arr];
253 ($selected_release_distrov) = $arr->[0]{distversion};
254 } else {
255 $excuse_string = "any distro";
256 my $last_addition;
257 my %seen;
258 for my $report (sort { $a->{id} <=> $b->{id} } @$arr) {
259 unless ($seen{$report->{distversion}}++) {
260 $last_addition = $report->{distversion};
263 $arr = [grep {$_->{distversion} eq $last_addition} @$arr];
264 ($selected_release_distrov) = $last_addition;
266 unless ($selected_release_distrov) {
267 warn "Warning: could not find $excuse_string in '$ctarget'";
268 return;
270 print STDERR "SELECTED: $selected_release_distrov\n" unless $Opt{quiet};
271 my @all;
272 for my $test (@$arr) {
273 my $id = $test->{id};
274 push @all, {id=>$id};
275 return if $Signal;
277 @all = sort { $b->{id} <=> $a->{id} } @all;
278 return \@all;
281 sub parse_single_report {
282 my($report, $dumpvars, %Opt) = @_;
283 my($id) = $report->{id};
284 $Opt{cachedir} ||= "$ENV{HOME}/var/cpantesters";
285 my $nnt_dir = "$Opt{cachedir}/nntp-testers";
286 mkpath $nnt_dir;
287 my $target = "$nnt_dir/$id";
288 if ($Opt{local}) {
289 unless (-e $target) {
290 die {severity=>0,text=>"Warning: No local file '$target' found, skipping\n"};
292 } else {
293 if (! -e $target) {
294 print STDERR "Fetching $target..." if $Opt{verbose} && !$Opt{quiet};
295 $Opt{transport} ||= $default_transport;
296 if (0) {
297 } elsif ($Opt{transport} eq "http_cpantesters") {
298 my $resp = _ua->mirror("http://www.cpantesters.org/cgi-bin/pages.cgi?act=cpan-report&raw=1&id=$id",$target);
299 if ($resp->is_success) {
300 if ($Opt{verbose}) {
301 my(@stat) = stat $target;
302 my $timestamp = gmtime $stat[9];
303 print STDERR "(timestamp $timestamp GMT)\n" unless $Opt{quiet};
304 if ($Opt{verbose} > 1) {
305 print STDERR $resp->headers->as_string unless $Opt{quiet};
308 my $headers = "$target.headers";
309 open my $fh, ">", $headers or die {severity=>1,text=>"Could not open >$headers: $!"};
310 print $fh $resp->headers->as_string;
311 } else {
312 die {severity=>0,
313 text=>sprintf "HTTP Server Error[%s] for id[%s]", $resp->status_line, $id};
315 } elsif ($Opt{transport} eq "http_cpantesters_gzip") {
316 if (-e "$target.gz") {
317 0 == system gunzip => $target or die;
319 my $resp = _ua->mirror("http://www.cpantesters.org/cgi-bin/pages.cgi?act=cpan-report&raw=1&id=$id",$target);
320 if ($resp->is_success) {
321 if ($Opt{verbose}) {
322 my(@stat) = stat $target;
323 my $timestamp = gmtime $stat[9];
324 print STDERR "(timestamp $timestamp GMT)\n" unless $Opt{quiet};
325 if ($Opt{verbose} > 1) {
326 print STDERR $resp->headers->as_string unless $Opt{quiet};
329 my $headers = "$target.headers";
330 open my $fh, ">", $headers or die {severity=>1,text=>"Could not open >$headers: $!"};
331 print $fh $resp->headers->as_string;
332 0 == system gzip => $target or die;
333 } else {
334 die {severity=>0,
335 text=>sprintf "HTTP Server Error[%s] for id[%s]", $resp->status_line, $id};
337 } else {
338 die {severity=>1,text=>"Illegal value for --transport: '$Opt{transport}'"};
342 parse_report($target, $dumpvars, %Opt);
345 sub parse_distro {
346 my($distro,%Opt) = @_;
347 my %dumpvars;
348 $Opt{cachedir} ||= "$ENV{HOME}/var/cpantesters";
349 my $cts_dir = "$Opt{cachedir}/cpantesters-show";
350 mkpath $cts_dir;
351 if ($Opt{solve}) {
352 require Statistics::Regression;
353 $Opt{dumpvars} = "." unless defined $Opt{dumpvars};
355 if (!$Opt{vdistro} && $distro =~ /^(.+)-(?i:v?\d+)(?:\.\d+)*\w*$/) {
356 $Opt{vdistro} = $distro;
357 $distro = $1;
359 my $reports;
360 if (my $ctdb = $Opt{ctdb}) {
361 require CPAN::WWW::Testers::Generator::Database;
362 require CPAN::DistnameInfo;
363 my $dbi = CPAN::WWW::Testers::Generator::Database->new(database=>$ctdb) or die "Alert: unknown error while opening database '$ctdb'";
364 unless ($Opt{vdistro}) {
365 my $sql = "select version from cpanstats where dist=? order by id";
366 my @rows = $dbi->get_query($sql,$distro);
367 my($newest,%seen);
368 for my $row (@rows) {
369 $newest = $row->[0] unless $seen{$row->[0]}++;
371 $Opt{vdistro} = "$distro-$newest";
373 my $d = CPAN::DistnameInfo->new("FOO/$Opt{vdistro}.tgz");
374 my $dist = $d->dist;
375 my $version = $d->version;
376 my $sql = "select id from cpanstats where dist=? and version=? order by id desc";
377 my @rows = $dbi->get_query($sql,$dist,$version);
378 my @all;
379 for my $row (@rows) {
380 my $id = $row->[0];
381 push @all, {id=>$id};
383 $reports = \@all;
384 } else {
385 my $ctarget = _download_overview($cts_dir, $distro, %Opt);
386 $Opt{ctformat} ||= $default_ctformat;
387 if ($Opt{ctformat} eq "html") {
388 $reports = _parse_html($ctarget,%Opt);
389 } else {
390 $reports = _parse_yaml($ctarget,%Opt);
393 return unless $reports;
394 my $sampled = 0;
395 my $i = 0;
396 my $samplesize = $Opt{sample} || 0;
397 $samplesize = 0 if $samplesize && $samplesize >= @$reports;
398 REPORT: for my $report (@$reports) {
399 $i++;
400 if ($samplesize) {
401 my $need = $samplesize - $sampled;
402 next REPORT unless $need;
403 my $left = @$reports - $i;
404 # warn sprintf "tot %d i %d sampled %d need %d left %d\n", scalar @$reports, $i, $sampled, $need, $left;
405 my $want_this = (rand(1) <= ($need/$left));
406 next REPORT unless $want_this;
408 eval {parse_single_report($report, \%dumpvars, %Opt)};
409 if ($@) {
410 if (ref $@) {
411 if ($@->{severity}) {
412 die $@->{text};
413 } else {
414 warn $@->{text};
416 } else {
417 die $@;
420 $sampled++;
421 last if $Signal;
423 if ($Opt{dumpvars}) {
424 require YAML::Syck;
425 my $dumpfile = $Opt{dumpfile} || "ctgetreports.out";
426 open my $fh, ">", $dumpfile or die "Could not open '$dumpfile' for writing: $!";
427 print $fh YAML::Syck::Dump(\%dumpvars);
428 close $fh or die "Could not close '$dumpfile': $!"
430 if ($Opt{solve}) {
431 solve(\%dumpvars,%Opt);
435 =head2 $bool = _looks_like_qp($raw_report)
437 We had to acknowledge the fact that some MTAs swallow the MIME-Version
438 header while passing MIME through. So we introduce fallback heuristics
439 that try to determine if a report is written in quoted printable.
441 Note that this subroutine is internal, just documented to have the
442 internals documented.
444 The current implementation counts the number of QP escaped spaces and
445 equal signs.
447 =cut
449 sub _looks_like_qp {
450 my($report) = @_;
451 my $count_space = () = $report =~ /=20/g;
452 return 1 if $count_space > 12;
453 my $count_equal = () = $report =~ /=3D/g;
454 return 1 if $count_equal > 12;
455 return 1 if $count_space+$count_equal > 24;
456 return 0; # waiting for a counter example
459 =head2 $extract = parse_report($target,$dumpvars,%Opt)
461 Reads one report. $target is the local filename to read. $dumpvars is
462 a hashref which gets filled with descriptive stats about
463 PASS/FAIL/etc. %Opt are the options as described in the
464 C<ctgetreports> manpage. $extract is a hashref containing the found
465 variables.
467 Note: this parsing is a bit dirty but as it seems good enough I'm not
468 inclined to change it. We parse HTML with regexps only, not an HTML
469 parser. Only the entities are decoded.
471 In %Opt you can use
473 article => $some_full_article_as_scalar
475 to use this function to parse one full article as text. When this is
476 given, the argument $target is not read, but its basename is taken to
477 be the id of the article. (OMG, hackers!)
479 =cut
480 sub parse_report {
481 my($target,$dumpvars,%Opt) = @_;
482 our @q;
483 my $id = basename($target);
484 # warn "DEBUG: id[$id]";
485 my($ok,$about);
487 my(%extract);
489 my($report,$isHTML) = _get_cooked_report($target, \%Opt);
490 my @qr = map /^qr:(.+)/, @{$Opt{q}};
491 if ($Opt{raw} || @qr) {
492 for my $qr (@qr) {
493 my $cqr = eval "qr{$qr}";
494 die "Could not compile regular expression '$qr': $@" if $@;
495 my(@matches) = $report =~ $cqr;
496 my $v;
497 if (@matches) {
498 if (@matches==1) {
499 $v = $matches[0];
500 } else {
501 $v = join "", map {"($_)"} @matches;
503 } else {
504 $v = "";
506 $extract{"qr:$qr"} = $v;
510 my $report_writer;
511 my $moduleunpack = {};
512 my $expect_prereq = 0;
513 my $expect_toolchain = 0;
514 my $expecting_toolchain_soon = 0;
515 my $fallback_p5 = "";
517 my $in_summary = 0;
518 my $in_summary_seen_platform = 0;
519 my $in_prg_output = 0;
520 my $in_env_context = 0;
522 my $current_headline;
523 my @previous_line = ""; # so we can neutralize line breaks
524 my @rlines = split /\r?\n/, $report;
525 LINE: for (@rlines) {
526 next LINE unless ($isHTML ? m/<title>((\S+)\s+(\S+))/ : m/^Subject:\s*((\S+)\s+(\S+))/);
527 my $s = $1;
528 $s = $1 if $s =~ m{<strong>(.+)};
529 if ($s =~ /(\S+)\s+(\S+)/) {
530 $ok = $1;
531 $about = $2;
533 $extract{"meta:ok"} = $ok;
534 $extract{"meta:about"} = $about;
535 last;
537 unless ($extract{"meta:about"}) {
538 $extract{"meta:about"} = $Opt{vdistro};
539 unless ($extract{"meta:ok"}) {
540 $DB::single++;
541 warn "Warning: could not determine state of report";
544 LINE: while (@rlines) {
545 $_ = shift @rlines;
546 while (/!$/ and @rlines) {
547 my $followupline = shift @rlines;
548 $followupline =~ s/^\s+//; # remo leading space
549 $_ .= $followupline;
551 if (/^--------/ && $previous_line[-2] && $previous_line[-2] =~ /^--------/) {
552 $current_headline = $previous_line[-1];
553 if ($current_headline =~ /PROGRAM OUTPUT/) {
554 $in_prg_output = 1;
555 } else {
556 $in_prg_output = 0;
558 if ($current_headline =~ /ENVIRONMENT AND OTHER CONTEXT/) {
559 $in_env_context = 1;
560 } else {
561 $in_env_context = 0;
564 if ($extract{"meta:perl"}) {
565 if ( $in_summary
566 and !$extract{"conf:git_commit_id"}
567 and /Commit id:\s*([[:xdigit:]]+)/) {
568 $extract{"conf:git_commit_id"} = $1;
570 } else {
571 my $p5;
572 if (0) {
573 } elsif (/Summary of my perl5 \((.+)\) configuration:/) {
574 $p5 = $1;
575 $in_summary = 1;
576 $in_env_context = 0;
578 if ($p5) {
579 my($r,$v,$s,$p);
580 if (($r,$v,$s,$p) = $p5 =~ /revision (\S+) version (\S+) subversion (\S+) patch (\S+)/) {
581 $r =~ s/\.0//; # 5.0 6 2!
582 $extract{"meta:perl"} = "$r.$v.$s\@$p";
583 } elsif (($r,$v,$s) = $p5 =~ /revision (\S+) version (\S+) subversion (\S+)/) {
584 $r =~ s/\.0//;
585 $extract{"meta:perl"} = "$r.$v.$s";
586 } elsif (($r,$v,$s) = $p5 =~ /(\d+\S*) patchlevel (\S+) subversion (\S+)/) {
587 $r =~ s/\.0//;
588 $extract{"meta:perl"} = "$r.$v.$s";
589 } else {
590 $extract{"meta:perl"} = $p5;
594 unless ($extract{"meta:from"}) {
595 if (0) {
596 } elsif ($isHTML ?
597 m|<div class="h_name">From:</div> <b>(.+?)</b><br/>| :
598 m|^From:\s*(.+)|
600 my $f = $1;
601 $f = $1 if $f =~ m{<strong>(.+)</strong>};
602 $extract{"meta:from"} = $f;
604 $extract{"meta:from"} =~ s/\.$// if $extract{"meta:from"};
606 unless ($extract{"meta:date"}) {
607 if (0) {
608 } elsif ($isHTML ?
609 m|<div class="h_name">Date:</div> (.+?)<br/>| :
610 m|^Date:\s*(.+)|
612 my $date = $1;
613 $date = $1 if $date =~ m{<strong>(.+)</strong>};
614 my($dt);
615 DATEFMT: for my $pat ("%Y-%m-%dT%TZ", # 2010-07-07T14:01:40Z
616 "%a, %d %b %Y %T %z", # Sun, 28 Sep 2008 12:23:12 +0100
617 "%b %d, %Y %R", # July 10,...
618 "%b %d, %Y %R", # July 4,...
620 $dt = eval {
621 my $p = DateTime::Format::Strptime->new
623 locale => "en",
624 time_zone => "UTC",
625 pattern => $pat,
627 $p->parse_datetime($date)
629 last DATEFMT if $dt;
631 unless ($dt) {
632 warn "Could not parse date[$date], setting to epoch 0";
633 $dt = DateTime->from_epoch( epoch => 0 );
635 $extract{"meta:date"} = $dt->datetime;
637 $extract{"meta:date"} =~ s/\.$// if $extract{"meta:date"};
639 unless ($extract{"meta:writer"}) {
640 for ("$previous_line[-1] $_") {
641 if (0) {
642 } elsif (/CPANPLUS, version (\S+)/) {
643 $extract{"meta:writer"} = "CPANPLUS $1";
644 } elsif (/created (?:automatically )?by (\S+)/) {
645 $extract{"meta:writer"} = $1;
646 if (/\s+on\s+perl\s+([^,]+),/) {
647 $fallback_p5 = $1;
649 } elsif (/This report was machine-generated by (\S+) (\S+)/) {
650 $extract{"meta:writer"} = "$1 $2";
652 $extract{"meta:writer"} =~ s/[\.,]$// if $extract{"meta:writer"};
655 if ($in_summary) {
656 # we do that first three lines a bit too often
657 my $qr = $Opt{dumpvars} || "";
658 $qr = qr/$qr/ if $qr;
659 unless (@q) {
660 @q = @{$Opt{q}||[]};
661 @q = qw(meta:perl conf:archname conf:usethreads conf:optimize meta:writer meta:from) unless @q;
664 my %conf_vars = map {($_ => 1)} grep { /^conf:/ } @q;
666 if (/^\s+Platform:$/) {
667 $in_summary_seen_platform=1;
668 } elsif (/^\s*$/ || m|</pre>|) {
669 # if not html, we have reached the end now
670 if ($in_summary_seen_platform) {
671 # some perls have an empty line after the summary line
672 $in_summary = 0;
674 } else {
675 my(%kv) = /\G,?\s*([^=]+)=('[^']+?'|\S+)/gc;
676 while (my($k,$v) = each %kv) {
677 my $ck = "conf:$k";
678 $ck =~ s/\s+$//;
679 $v =~ s/,$//;
680 if ($v =~ /^'(.*)'$/) {
681 $v = $1;
683 $v =~ s/^\s+//;
684 $v =~ s/\s+$//;
685 if ($qr && $ck =~ $qr) {
686 $extract{$ck} = $v;
687 } elsif ($conf_vars{$ck}) {
688 $extract{$ck} = $v;
693 if ($in_prg_output) {
694 unless ($extract{"meta:output_from"}) {
695 if (/Output from (.+):$/) {
696 $extract{"meta:output_from"} = $1
700 if ($in_env_context) {
701 if (/^\s{4}(\S+)\s*=\s*(.*)$/) {
702 $extract{"env:$1"} = $2;
705 push @previous_line, $_;
706 if ($expect_prereq || $expect_toolchain) {
707 if (/Perl module toolchain versions installed/) {
708 # first time discovered in CPANPLUS 0.89_06
709 $expecting_toolchain_soon = 1;
710 $expect_prereq=0;
711 next LINE;
713 if (exists $moduleunpack->{type}) {
714 my($module,$v,$needwant);
715 # type 1 and 2 are about prereqs, type three about toolchain
716 if ($moduleunpack->{type} == 1) {
717 (my $leader,$module,$needwant,$v) = eval { unpack $moduleunpack->{tpl}, $_; };
718 next LINE if $@;
719 if ($leader =~ /^-/) {
720 $moduleunpack = {};
721 $expect_prereq = 0;
722 next LINE;
723 } elsif ($leader =~ /^(
724 buil # build_requires:
725 |conf # configure_requires:
726 )/x) {
727 next LINE;
728 } elsif ($module =~ /^(
729 - # line drawing
730 )/x) {
731 next LINE;
733 } elsif ($moduleunpack->{type} == 2) {
734 (my $leader,$module,$v,$needwant) = eval { unpack $moduleunpack->{tpl}, $_; };
735 next LINE if $@;
736 for ($module,$v,$needwant) {
737 s/^\s+//;
738 s/\s+$//;
740 if ($leader =~ /^\*/) {
741 $moduleunpack = {};
742 $expect_prereq = 0;
743 next LINE;
744 } elsif (!defined $v
745 or !defined $needwant
746 or $v =~ /\s/
747 or $needwant =~ /\s/
749 ($module,$v,$needwant) = split " ", $_;
751 } elsif ($moduleunpack->{type} == 3) {
752 (my $leader,$module,$v) = eval { unpack $moduleunpack->{tpl}, $_; };
753 next LINE if $@;
754 if (!$module) {
755 $moduleunpack = {};
756 $expect_toolchain = 0;
757 next LINE;
758 } elsif ($module =~ /^-/) {
759 next LINE;
762 $module =~ s/\s+$//;
763 if ($module) {
764 $v =~ s/^\s+//;
765 $v =~ s/\s+$//;
766 my($modulename,$versionlead) = split " ", $module;
767 if (defined $modulename and defined $versionlead) {
768 $module = $modulename;
769 $v = "$versionlead$v";
771 if ($v eq "Have") {
772 next LINE;
774 $extract{"mod:$module"} = $v;
775 if ($needwant) {
776 $needwant =~ s/^\s+//;
777 $needwant =~ s/\s+$//;
778 $extract{"prereq:$module"} = $needwant;
782 if (/(\s+)(Module\s+)(Need\s+)Have/) {
783 $in_env_context = 0;
784 $moduleunpack = {
785 tpl => 'a'.length($1).'a'.length($2).'a'.length($3).'a*',
786 type => 1,
788 } elsif (/(\s+)(Module Name\s+)(Have)(\s+)Want/) {
789 $in_env_context = 0;
790 my $adjust_1 = 0;
791 my $adjust_2 = -length($4);
792 my $adjust_3 = length($4);
793 # I think they do not really try to align, usually we
794 # get away with split
795 $moduleunpack = {
796 tpl => 'a'.length($1).'a'.(length($2)+$adjust_2).'a'.(length($3)+$adjust_3).'a*',
797 type => 2,
801 if (/PREREQUISITES|Prerequisite modules loaded/) {
802 $in_env_context = 0;
803 $expect_prereq=1;
805 if ($expecting_toolchain_soon) {
806 if (/(\s+)(Module(?:\sName)?\s+) Have/) {
807 $in_env_context = 0;
808 $expect_toolchain=1;
809 $expecting_toolchain_soon=0;
810 $moduleunpack = {
811 tpl => 'a'.length($1).'a'.length($2).'a*',
812 type => 3,
816 if (/toolchain versions installed/) {
817 $in_env_context = 0;
818 $expecting_toolchain_soon=1;
820 } # LINE
821 if (! $extract{"meta:perl"} && $fallback_p5) {
822 my($p5,$patch) = split /\s+patch\s+/, $fallback_p5;
823 $extract{"meta:perl"} = $p5;
824 $extract{"conf:git_describe"} = $patch if defined $patch;
826 $extract{id} = $id;
827 if (my $filtercbbody = $Opt{filtercb}) {
828 my $filtercb = eval('sub {'.$filtercbbody.'}');
829 $filtercb->(\%extract);
831 if ($Opt{solve}) {
832 if ($extract{"conf:osvers"} && $extract{"conf:archname"}) {
833 $extract{"conf:archname+osvers"} = join " ", @extract{"conf:archname","conf:osvers"};
835 my $data = $dumpvars->{"==DATA=="} ||= [];
836 push @$data, \%extract;
838 # ---- %extract finished ----
839 my $diag = "";
840 if (my $qr = $Opt{dumpvars}) {
841 $qr = qr/$qr/;
842 while (my($k,$v) = each %extract) {
843 if ($k =~ $qr) {
844 $dumpvars->{$k}{$v}{$extract{"meta:ok"}}++;
848 for my $want (@q) {
849 my $have = $extract{$want} || "";
850 $diag .= " $want\[$have]";
852 printf STDERR " %-4s %8d%s\n", $extract{"meta:ok"}, $id, $diag unless $Opt{quiet};
853 if ($Opt{raw}) {
854 $report =~ s/\s+\z//;
855 print STDERR $report, "\n================\n" unless $Opt{quiet};
857 if ($Opt{interactive}) {
858 require IO::Prompt;
859 local @ARGV;
860 local $ARGV;
861 my $ans = IO::Prompt::prompt
863 -p => "View $id? [onechar: ynq] ",
864 -d => "y",
865 -u => qr/[ynq]/,
866 -onechar,
868 print STDERR "\n" unless $Opt{quiet};
869 if ($ans eq "y") {
870 my($report) = _get_cooked_report($target, \%Opt);
871 $Opt{pager} ||= "less";
872 open my $lfh, "|-", $Opt{pager} or die "Could not fork '$Opt{pager}': $!";
873 local $/;
874 print {$lfh} $report;
875 close $lfh or die "Could not close pager: $!"
876 } elsif ($ans eq "q") {
877 $Signal++;
878 return;
881 return \%extract;
884 sub _get_cooked_report {
885 my($target, $Opt_ref) = @_;
886 my($report, $isHTML);
887 if ($report = $Opt_ref->{article}) {
888 $isHTML = $report =~ /^</;
889 undef $target;
891 if ($target) {
892 my $fh;
893 if (0) {
894 } elsif (-e $target) {
895 open $fh, '<', $target or die "Could not open '$target': $!";
896 } elsif (-e "$target.gz") {
897 open $fh, "-|", "zcat", $target or die "Could not open '$target.gz': $!";
898 } else {
899 die "Could not find '$target' or '$target.gz'";
901 local $/;
902 my $raw_report = <$fh>;
903 $isHTML = $raw_report =~ /^</;
904 if ($isHTML) {
905 if ($raw_report =~ m{^<\?.+?<html.+?<head.+?<body.+?<pre[^>]*>(.+)</pre>.*</body>.*</html>}s) {
906 $raw_report = decode_entities($1);
907 $isHTML = 0;
910 if ($isHTML) {
911 $report = decode_entities($raw_report);
912 } elsif ($raw_report =~ /^MIME-Version: 1.0$/m
914 _looks_like_qp($raw_report)
916 # minimizing MIME effort; don't know about reports in other formats
917 $report = MIME::QuotedPrint::decode_qp($raw_report);
918 } else {
919 $report = $raw_report;
921 close $fh;
923 if ($report =~ /\r\n/) {
924 my @rlines = split /\r?\n/, $report;
925 $report = join "\n", @rlines;
927 ($report, $isHTML);
930 =head2 solve
932 Feeds a couple of potentially interesting data to
933 Statistics::Regression and sorts the result by R^2 descending. Do not
934 confuse this with a prove, rather take it as a useful hint. It can
935 save you minutes of staring at data and provide a quick overview where
936 one should look closer. Displays the N top candidates, where N
937 defaults to 3 and can be set with the C<$Opt{solvetop}> variable.
938 Regressions results with an R^2 of 1.00 are displayed in any case.
940 The function is called when the option C<-solve> is give on the
941 commandline. Several extra config variables are calculated, see source
942 code for details.
944 =cut
946 my %never_solve_on = map {($_ => 1)}
948 "conf:ccflags",
949 "conf:config_args",
950 "conf:cppflags",
951 "conf:lddlflags",
952 "conf:uname",
953 "env:PATH",
954 "env:PERL5LIB",
955 "env:PERL5OPT",
956 'env:$^X',
957 'env:$EGID',
958 'env:$GID',
959 'env:$UID/$EUID',
960 'env:PERL5_CPANPLUS_IS_RUNNING',
961 'env:PERL5_CPAN_IS_RUNNING',
962 'env:PERL5_CPAN_IS_RUNNING_IN_RECURSION',
963 'meta:ok',
965 my %normalize_numeric =
967 id => sub { return shift },
968 'meta:date' => sub {
969 my $v = shift;
970 my($Y,$M,$D,$h,$m,$s) = $v =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)/;
971 unless (defined $M) {
972 die "illegal value[$v] for a date";
974 Time::Local::timegm($s,$m,$h,$D,$M-1,$Y);
977 my %normalize_value =
979 'meta:perl' => sub {
980 my($perlatpatchlevel) = shift;
981 my $perl = $perlatpatchlevel;
982 $perl =~ s/\@.*//;
983 $perl;
986 sub solve {
987 my($V,%Opt) = @_;
988 require Statistics::Regression;
989 my @regression;
990 my $ycb;
991 if (my $ycbbody = $Opt{ycb}) {
992 $ycb = eval('sub {'.$ycbbody.'}');
993 die if $@;
994 } else {
995 $ycb = sub {
996 my $rec = shift;
997 my $y;
998 if ($rec->{"meta:ok"} eq "PASS") {
999 $y = 1;
1000 } elsif ($rec->{"meta:ok"} eq "FAIL") {
1001 $y = 0;
1003 return $y
1006 VAR: for my $variable (sort keys %$V) {
1007 next if $variable eq "==DATA==";
1008 if ($never_solve_on{$variable}){
1009 warn "Skipping '$variable'\n" unless $Opt{quiet};
1010 next VAR;
1012 my $value_distribution = $V->{$variable};
1013 my $keys = keys %$value_distribution;
1014 my @X = qw(const);
1015 if ($normalize_numeric{$variable}) {
1016 push @X, "n_$variable";
1017 } else {
1018 my %seen = ();
1019 for my $value (sort keys %$value_distribution) {
1020 my $pf = $value_distribution->{$value};
1021 $pf->{PASS} ||= 0;
1022 $pf->{FAIL} ||= 0;
1023 if ($pf->{PASS} || $pf->{FAIL}) {
1024 my $Xele = sprintf "eq_%s",
1026 $normalize_value{$variable} ?
1027 $normalize_value{$variable}->($value) :
1028 $value
1030 push @X, $Xele unless $seen{$Xele}++;
1033 if (
1034 $pf->{PASS} xor $pf->{FAIL}
1036 my $vl = 40;
1037 substr($value,$vl) = "..." if length $value > 3+$vl;
1038 my $poor_mans_freehand_estimation = 0;
1039 if ($poor_mans_freehand_estimation) {
1040 warn sprintf
1042 "%4d %4d %-23s | %s\n",
1043 $pf->{PASS},
1044 $pf->{FAIL},
1045 $variable,
1046 $value,
1052 warn "variable[$variable]keys[$keys]X[@X]\n" unless $Opt{quiet};
1053 next VAR unless @X > 1;
1054 my %regdata =
1056 X => \@X,
1057 data => [],
1059 RECORD: for my $rec (@{$V->{"==DATA=="}}) {
1060 my $y = $ycb->($rec);
1061 next RECORD unless defined $y;
1062 my %obs;
1063 $obs{Y} = $y;
1064 @obs{@X} = (0) x @X;
1065 $obs{const} = 1;
1066 for my $x (@X) {
1067 if ($x =~ /^eq_(.+)/) {
1068 my $read_v = $1;
1069 if (exists $rec->{$variable}
1070 && defined $rec->{$variable}
1072 my $use_v = (
1073 $normalize_value{$variable} ?
1074 $normalize_value{$variable}->($rec->{$variable}) :
1075 $rec->{$variable}
1077 if ($use_v eq $read_v) {
1078 $obs{$x} = 1;
1081 # warn "DEBUG: y[$y]x[$x]obs[$obs{$x}]\n";
1082 } elsif ($x =~ /^n_(.+)/) {
1083 my $v = $1;
1084 $obs{$x} = eval { $normalize_numeric{$v}->($rec->{$v}); };
1085 if ($@) {
1086 warn "Warning: error during parsing v[$v] in record[$rec->{id}]: $@; continuing with undef value";
1090 push @{$regdata{data}}, \%obs;
1092 _run_regression ($variable, \%regdata, \@regression, \%Opt);
1094 my $top = min ($Opt{solvetop} || 3, scalar @regression);
1095 my $max_rsq = sum map {1==$_->rsq ? 1 : 0} @regression;
1096 $top = $max_rsq if $max_rsq && $max_rsq > $top;
1097 my $score = 0;
1098 printf
1100 "State after regression testing: %d results, showing top %d\n\n",
1101 scalar @regression,
1102 $top,
1104 for my $reg (sort {
1105 $b->rsq <=> $a->rsq
1107 $a->k <=> $b->k
1108 } @regression) {
1109 printf "(%d)\n", ++$score;
1110 eval { $reg->print; };
1111 if ($@) {
1112 printf "\n\nOops, Statistics::Regression died during ->print() with error message[$@]\n\n";
1114 last if --$top <= 0;
1119 # $variable is the name we pass through to S:R constructor
1120 # $regdata is hash and has the arrays "X" and "data" (observations)
1121 # X goes to S:R constructor
1122 # each observation has a Y which we pass to S:R in an include() call
1123 # $regression is the collector array of results
1124 # $opt are the options from outside, used to see if we are "verbose"
1125 sub _run_regression {
1126 my($variable,$regdata,$regression,$opt) = @_;
1127 my @X = @{$regdata->{X}};
1128 # my $splo = $regdata->{"spliced-out"} = []; # maybe can be used to
1129 # hold the reference
1130 # group
1131 while (@X > 1) {
1132 my $reg = Statistics::Regression->new($variable,\@X);
1133 for my $obs (@{$regdata->{data}}) {
1134 my $y = delete $obs->{Y};
1135 $reg->include($y, $obs);
1136 $obs->{Y} = $y;
1138 eval {$reg->theta;
1139 my @e = $reg->standarderrors;
1140 die "found standarderrors == 0" if grep { 0 == $_ } @e;
1141 $reg->rsq;};
1142 if ($@) {
1143 if ($opt->{verbose} && $opt->{verbose}>=2) {
1144 require YAML::Syck;
1145 warn YAML::Syck::Dump
1146 ({error=>"could not determine some regression parameters",
1147 variable=>$variable,
1148 k=>$reg->k,
1149 n=>$reg->n,
1150 X=>$regdata->{"X"},
1151 errorstr => $@,
1154 # reduce k in case that linear dependencies disturbed us;
1155 # often called reference group; I'm tempted to collect and
1156 # make visible
1157 splice @X, 1, 1;
1158 } else {
1159 # $reg->print;
1160 push @$regression, $reg;
1161 return;
1166 =head1 AUTHOR
1168 Andreas König
1170 =head1 BUGS
1172 Please report any bugs or feature requests through the web
1173 interface at
1174 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=CPAN-Testers-ParseReport>.
1175 I will be notified, and then you'll automatically be notified of
1176 progress on your bug as I make changes.
1178 =head1 SUPPORT
1180 You can find documentation for this module with the perldoc command.
1182 perldoc CPAN::Testers::ParseReport
1185 You can also look for information at:
1187 =over 4
1189 =item * RT: CPAN's request tracker
1191 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=CPAN-Testers-ParseReport>
1193 =item * AnnoCPAN: Annotated CPAN documentation
1195 L<http://annocpan.org/dist/CPAN-Testers-ParseReport>
1197 =item * CPAN Ratings
1199 L<http://cpanratings.perl.org/d/CPAN-Testers-ParseReport>
1201 =item * Search CPAN
1203 L<http://search.cpan.org/dist/CPAN-Testers-ParseReport>
1205 =back
1208 =head1 ACKNOWLEDGEMENTS
1210 Thanks to RJBS for module-starter.
1212 =head1 COPYRIGHT & LICENSE
1214 Copyright 2008 Andreas König.
1216 This program is free software; you can redistribute it and/or modify it
1217 under the same terms as Perl itself.
1220 =cut
1222 1; # End of CPAN::Testers::ParseReport