lib/CPAN/Testers/ParseReport.pm

   1 package CPAN::Testers::ParseReport;
   2
   3 use warnings;
   4 use strict;
   5
   6 use Config::Perl::V ();
   7 use DateTime::Format::Strptime;
   8 use File::Basename qw(basename);
   9 use File::Path qw(mkpath);
  10 use HTML::Entities qw(decode_entities);
  11 use LWP::UserAgent;
  12 use List::Util qw(max min sum);
  13 use MIME::QuotedPrint ();
  14 use Net::NNTP ();
  15 use Time::Local ();
  16 use XML::LibXML;
  17 use XML::LibXML::XPathContext;
  18
  19 our $default_ctformat = "yaml";
  20 our $default_transport = "nntp";
  21 our $default_cturl = "http://www.cpantesters.org/show";
  22 our $Signal = 0;
  23
  24 =encoding utf-8
  25
  26 =head1 NAME
  27
  28 CPAN::Testers::ParseReport - parse reports to www.cpantesters.org from various sources
  29
  30 =cut
  31
  32 use version; our $VERSION = qv('0.1.10');
  33
  34 =head1 SYNOPSIS
  35
  36 The documentation in here is normally not needed because the code is
  37 meant to be run from a standalone program, L<ctgetreports>.
  38
  39   ctgetreports --q mod:Moose Devel-Events
  40
  41 =head1 DESCRIPTION
  42
  43 This is the core module for CPAN::Testers::ParseReport. If you're not
  44 looking to extend or alter the behaviour of this module, you probably
  45 want to look at L<ctgetreports> instead.
  46
  47 =head1 OPTIONS
  48
  49 Are described in the L<ctgetreports> manpage and are passed through to
  50 the functions unaltered.
  51
  52 =head1 FUNCTIONS
  53
  54 =head2 parse_distro($distro,%options)
  55
  56 reads the cpantesters HTML page or the YAML file or the local database
  57 for the distro and loops through the reports for the specified or most
  58 recent version of that distro found in these data.
  59
  60 parse_distro() intentionally has no meaningful return value, different
  61 options would require different ones.
  62
  63 =head2 $extract = parse_single_report($report,$dumpvars,%options)
  64
  65 mirrors and reads this report. $report is of the form
  66
  67   { id => number }
  68
  69 $dumpvar is a hashreference that gets filled with data.
  70
  71 $extract is the result of parse_report() described below.
  72
  73 =cut
  74
  75 {
  76     my $ua;
  77     sub _ua {
  78         return $ua if $ua;
  79         $ua = LWP::UserAgent->new
  80             (
  81              keep_alive => 1,
  82              env_proxy => 1,
  83             );
  84         $ua->parse_head(0);
  85         $ua;
  86     }
  87 }
  88
  89 {
  90     my $nntp;
  91     sub _nntp {
  92         return $nntp if $nntp;
  93         $nntp = Net::NNTP->new("nntp.perl.org");
  94         $nntp->group("perl.cpan.testers");
  95         return $nntp;
  96     }
  97 }
  98
  99 {
 100     my $xp;
 101     sub _xp {
 102         return $xp if $xp;
 103         $xp = XML::LibXML->new;
 104         $xp->keep_blanks(0);
 105         $xp->clean_namespaces(1);
 106         my $catalog = __FILE__;
 107         $catalog =~ s|ParseReport.pm$|ParseReport/catalog|;
 108         $xp->load_catalog($catalog);
 109         return $xp;
 110     }
 111 }
 112
 113 sub _download_overview {
 114     my($cts_dir, $distro, %Opt) = @_;
 115     my $format = $Opt{ctformat} ||= $default_ctformat;
 116     my $cturl = $Opt{cturl} ||= $default_cturl;
 117     my $ctarget = "$cts_dir/$distro.$format";
 118     my $cheaders = "$cts_dir/$distro.headers";
 119     if ($Opt{local}) {
 120         unless (-e $ctarget) {
 121             die "Alert: No local file '$ctarget' found, cannot continue\n";
 122         }
 123     } else {
 124         if (! -e $ctarget or -M $ctarget > .25) {
 125             if (-e $ctarget && $Opt{verbose}) {
 126                 my(@stat) = stat _;
 127                 my $timestamp = gmtime $stat[9];
 128                 print STDERR "(timestamp $timestamp GMT)\n" unless $Opt{quiet};
 129             }
 130             print STDERR "Fetching $ctarget..." if $Opt{verbose} && !$Opt{quiet};
 131             my $uri = "$cturl/$distro.$format";
 132             my $resp = _ua->mirror($uri,$ctarget);
 133             if ($resp->is_success) {
 134                 print STDERR "DONE\n" if $Opt{verbose} && !$Opt{quiet};
 135                 open my $fh, ">", $cheaders or die;
 136                 for ($resp->headers->as_string) {
 137                     print $fh $_;
 138                     if ($Opt{verbose} && $Opt{verbose}>1) {
 139                         print STDERR $_ unless $Opt{quiet};
 140                     }
 141                 }
 142             } elsif (304 == $resp->code) {
 143                 print STDERR "DONE (not modified)\n" if $Opt{verbose} && !$Opt{quiet};
 144                 my $atime = my $mtime = time;
 145                 utime $atime, $mtime, $cheaders;
 146             } else {
 147                 die sprintf
 148                     (
 149                      "No success downloading %s: %s",
 150                      $uri,
 151                      $resp->status_line,
 152                     );
 153             }
 154         }
 155     }
 156     return $ctarget;
 157 }
 158
 159 sub _parse_html {
 160     my($ctarget, %Opt) = @_;
 161     my $content = do { open my $fh, $ctarget or die; local $/; <$fh> };
 162     my $preprocesswithtreebuilder = 0; # not needed since barbie switched to XHTML
 163     if ($preprocesswithtreebuilder) {
 164         require HTML::TreeBuilder;
 165         my $tree = HTML::TreeBuilder->new;
 166         $tree->implicit_tags(1);
 167         $tree->p_strict(1);
 168         $tree->ignore_ignorable_whitespace(0);
 169         $tree->parse_content($content);
 170         $tree->eof;
 171         $content = $tree->as_XML;
 172     }
 173     my $parser = _xp();
 174     my $doc = eval { $parser->parse_string($content) };
 175     my $err = $@;
 176     unless ($doc) {
 177         my $distro = basename $ctarget;
 178         die sprintf "Error while parsing %s\: %s", $distro, $err;
 179     }
 180     my $xc = XML::LibXML::XPathContext->new($doc);
 181     my $nsu = $doc->documentElement->namespaceURI;
 182     $xc->registerNs('x', $nsu) if $nsu;
 183     my($selected_release_ul,$selected_release_distrov,$excuse_string);
 184     my($cparentdiv)
 185         = $nsu ?
 186             $xc->findnodes("/x:html/x:body/x:div[\@id = 'doc']") :
 187                 $doc->findnodes("/html/body/div[\@id = 'doc']");
 188     my(@releasedivs) = $nsu ?
 189         $xc->findnodes("//x:div[x:h2 and x:ul]",$cparentdiv) :
 190             $cparentdiv->findnodes("//div[h2 and ul]");
 191     my $releasediv;
 192     if ($Opt{vdistro}) {
 193         $excuse_string = "selected distro '$Opt{vdistro}'";
 194         my($fallbacktoversion) = $Opt{vdistro} =~ /(\d+\..*)/;
 195         $fallbacktoversion = 0 unless defined $fallbacktoversion;
 196       RELEASE: for my $i (0..$#releasedivs) {
 197             my $picked = "";
 198             my($x) = $nsu ?
 199                 $xc->findvalue("x:h2/x:a[2]/\@name",$releasedivs[$i]) :
 200                     $releasedivs[$i]->findvalue("h2/a[2]/\@name");
 201             if ($x) {
 202                 if ($x eq $Opt{vdistro}) {
 203                     $releasediv = $i;
 204                     $picked = " (picked)";
 205                 }
 206                 print STDERR "FOUND DISTRO: $x$picked\n" unless $Opt{quiet};
 207             } else {
 208                 ($x) = $nsu ?
 209                     $xc->findvalue("x:h2/x:a[1]/\@name",$releasedivs[$i]) :
 210                         $releasedivs[$i]->findvalue("h2/a[1]/\@name");
 211                 if ($x eq $fallbacktoversion) {
 212                     $releasediv = $i;
 213                     $picked = " (picked)";
 214                 }
 215                 print STDERR "FOUND VERSION: $x$picked\n" unless $Opt{quiet};
 216             }
 217         }
 218     } else {
 219         $excuse_string = "any distro";
 220     }
 221     unless (defined $releasediv) {
 222         $releasediv = 0;
 223     }
 224     # using a[1] because a[2] is missing on the first entry
 225     ($selected_release_distrov) = $nsu ?
 226         $xc->findvalue("x:h2/x:a[1]/\@name",$releasedivs[$releasediv]) :
 227             $releasedivs[$releasediv]->findvalue("h2/a[1]/\@name");
 228     ($selected_release_ul) = $nsu ?
 229         $xc->findnodes("x:ul",$releasedivs[$releasediv]) :
 230             $releasedivs[$releasediv]->findnodes("ul");
 231     unless (defined $selected_release_distrov) {
 232         warn "Warning: could not find $excuse_string in '$ctarget'";
 233         return;
 234     }
 235     print STDERR "SELECTED: $selected_release_distrov\n" unless $Opt{quiet};
 236     my($id);
 237     my @all;
 238     for my $test ($nsu ?
 239                   $xc->findnodes("x:li",$selected_release_ul) :
 240                   $selected_release_ul->findnodes("li")) {
 241         $id = $nsu ?
 242             $xc->findvalue("x:a[1]/text()",$test)     :
 243                 $test->findvalue("a[1]/text()");
 244         push @all, {id=>$id};
 245         return if $Signal;
 246     }
 247     return \@all;
 248 }
 249
 250 sub _parse_yaml {
 251     my($ctarget, %Opt) = @_;
 252     require YAML::Syck;
 253     my $arr = YAML::Syck::LoadFile($ctarget);
 254     my($selected_release_ul,$selected_release_distrov,$excuse_string);
 255     if ($Opt{vdistro}) {
 256         $excuse_string = "selected distro '$Opt{vdistro}'";
 257         $arr = [grep {$_->{distversion} eq $Opt{vdistro}} @$arr];
 258         ($selected_release_distrov) = $arr->[0]{distversion};
 259     } else {
 260         $excuse_string = "any distro";
 261         my $last_addition;
 262         my %seen;
 263         for my $report (sort { $a->{id} <=> $b->{id} } @$arr) {
 264             unless ($seen{$report->{distversion}}++) {
 265                 $last_addition = $report->{distversion};
 266             }
 267         }
 268         $arr = [grep {$_->{distversion} eq $last_addition} @$arr];
 269         ($selected_release_distrov) = $last_addition;
 270     }
 271     unless ($selected_release_distrov) {
 272         warn "Warning: could not find $excuse_string in '$ctarget'";
 273         return;
 274     }
 275     print STDERR "SELECTED: $selected_release_distrov\n" unless $Opt{quiet};
 276     my @all;
 277     for my $test (@$arr) {
 278         my $id = $test->{id};
 279         push @all, {id=>$id};
 280         return if $Signal;
 281     }
 282     @all = sort { $b->{id} <=> $a->{id} } @all;
 283     return \@all;
 284 }
 285
 286 sub parse_single_report {
 287     my($report, $dumpvars, %Opt) = @_;
 288     my($id) = $report->{id};
 289     $Opt{cachedir} ||= "$ENV{HOME}/var/cpantesters";
 290     my $nnt_dir = "$Opt{cachedir}/nntp-testers";
 291     mkpath $nnt_dir;
 292     my $target = "$nnt_dir/$id";
 293     if ($Opt{local}) {
 294         unless (-e $target) {
 295             die {severity=>0,text=>"Warning: No local file '$target' found, skipping\n"};
 296         }
 297     } else {
 298         if (! -e $target) {
 299             print STDERR "Fetching $target..." if $Opt{verbose} && !$Opt{quiet};
 300             $Opt{transport} ||= $default_transport;
 301             if ($Opt{transport} eq "nntp") {
 302                 my $article = _nntp->article($id);
 303                 unless ($article) {
 304                     die {severity=>0,text=>"NNTP-Server did not return an article for id[$id]"};
 305                 }
 306                 open my $fh, ">", $target or die {severity=>1,text=>"Could not open >$target: $!"};
 307                 print $fh @$article;
 308             } elsif ($Opt{transport} eq "http") {
 309                 my $resp = _ua->mirror("http://www.nntp.perl.org/group/perl.cpan.testers/$id",$target);
 310                 if ($resp->is_success) {
 311                     if ($Opt{verbose}) {
 312                         my(@stat) = stat $target;
 313                         my $timestamp = gmtime $stat[9];
 314                         print STDERR "(timestamp $timestamp GMT)\n" unless $Opt{quiet};
 315                         if ($Opt{verbose} > 1) {
 316                             print STDERR $resp->headers->as_string unless $Opt{quiet};
 317                         }
 318                     }
 319                     my $headers = "$target.headers";
 320                     open my $fh, ">", $headers or die {severity=>1,text=>"Could not open >$headers: $!"};
 321                     print $fh $resp->headers->as_string;
 322                 } else {
 323                     die {severity=>0,
 324                              text=>sprintf "HTTP Server Error[%s] for id[%s]", $resp->status_line, $id};
 325                 }
 326             } else {
 327                 die {severity=>1,text=>"Illegal value for --transport: '$Opt{transport}'"};
 328             }
 329         }
 330     }
 331     parse_report($target, $dumpvars, %Opt);
 332 }
 333
 334 sub parse_distro {
 335     my($distro,%Opt) = @_;
 336     my %dumpvars;
 337     $Opt{cachedir} ||= "$ENV{HOME}/var/cpantesters";
 338     my $cts_dir = "$Opt{cachedir}/cpantesters-show";
 339     mkpath $cts_dir;
 340     if ($Opt{solve}) {
 341         require Statistics::Regression;
 342         $Opt{dumpvars} = "." unless defined $Opt{dumpvars};
 343     }
 344     if (!$Opt{vdistro} && $distro =~ /^(.+)-(?i:v?\d+)(?:\.\d+)*\w*$/) {
 345         $Opt{vdistro} = $distro;
 346         $distro = $1;
 347     }
 348     my $reports;
 349     if (my $ctdb = $Opt{ctdb}) {
 350         require CPAN::WWW::Testers::Generator::Database;
 351         require CPAN::DistnameInfo;
 352         my $dbi = CPAN::WWW::Testers::Generator::Database->new(database=>$ctdb) or die "Alert: unknown error while opening database '$ctdb'";
 353         unless ($Opt{vdistro}) {
 354             my $sql = "select version from cpanstats where dist=? order by id";
 355             my @rows = $dbi->get_query($sql,$distro);
 356             my($newest,%seen);
 357             for my $row (@rows) {
 358                 $newest = $row->[0] unless $seen{$row->[0]}++;
 359             }
 360             $Opt{vdistro} = "$distro-$newest";
 361         }
 362         my $d = CPAN::DistnameInfo->new("FOO/$Opt{vdistro}.tgz");
 363         my $dist = $d->dist;
 364         my $version = $d->version;
 365         my $sql = "select id from cpanstats where dist=? and version=? order by id desc";
 366         my @rows = $dbi->get_query($sql,$dist,$version);
 367         my @all;
 368         for my $row (@rows) {
 369             my $id = $row->[0];
 370             push @all, {id=>$id};
 371         }
 372         $reports = \@all;
 373     } else {
 374         my $ctarget = _download_overview($cts_dir, $distro, %Opt);
 375         $Opt{ctformat} ||= $default_ctformat;
 376         if ($Opt{ctformat} eq "html") {
 377             $reports = _parse_html($ctarget,%Opt);
 378         } else {
 379             $reports = _parse_yaml($ctarget,%Opt);
 380         }
 381     }
 382     return unless $reports;
 383     for my $report (@$reports) {
 384         eval {parse_single_report($report, \%dumpvars, %Opt)};
 385         if ($@) {
 386             if (ref $@) {
 387                 if ($@->{severity}) {
 388                     die $@->{text};
 389                 } else {
 390                     warn $@->{text};
 391                 }
 392             } else {
 393                 die $@;
 394             }
 395         }
 396         last if $Signal;
 397     }
 398     if ($Opt{dumpvars}) {
 399         require YAML::Syck;
 400         my $dumpfile = $Opt{dumpfile} || "ctgetreports.out";
 401         open my $fh, ">", $dumpfile or die "Could not open '$dumpfile' for writing: $!";
 402         print $fh YAML::Syck::Dump(\%dumpvars);
 403         close $fh or die "Could not close '$dumpfile': $!"
 404     }
 405     if ($Opt{solve}) {
 406         solve(\%dumpvars,%Opt);
 407     }
 408 }
 409
 410 =head2 $bool = _looks_like_qp($raw_report)
 411
 412 We had to acknowledge the fact that some MTAs swallow the MIME-Version
 413 header while passing MIME through. So we introduce fallback heuristics
 414 that try to determine if a report is written in quoted printable.
 415
 416 Note that this subroutine is internal, just documented to have the
 417 internals documented.
 418
 419 The current implementation counts the number of QP escaped spaces and
 420 equal signs.
 421
 422 =cut
 423
 424 sub _looks_like_qp {
 425     my($report) = @_;
 426     my $count_space = () = $report =~ /=20/g;
 427     return 1 if $count_space > 12;
 428     my $count_equal = () = $report =~ /=3D/g;
 429     return 1 if $count_equal > 12;
 430     return 1 if $count_space+$count_equal > 24;
 431     return 0; # waiting for a counter example
 432 }
 433
 434 =head2 $extract = parse_report($target,$dumpvars,%Opt)
 435
 436 Reads one report. $target is the local filename to read. $dumpvars is
 437 a hashref which gets filled with descriptive stats about
 438 PASS/FAIL/etc. %Opt are the options as described in the
 439 C<ctgetreports> manpage. $extract is a hashref containing the found
 440 variables.
 441
 442 Note: this parsing is a bit dirty but as it seems good enough I'm not
 443 inclined to change it. We parse HTML with regexps only, not an HTML
 444 parser. Only the entities are decoded.
 445
 446 Update around version 0.0.17: switching to nntp now but keeping the
 447 parser for HTML around to read old local copies.
 448
 449 Update around 0.0.18: In %Options you can use
 450
 451     article => $some_full_article_as_scalar
 452
 453 to use this function to parse one full article as text. When this is
 454 given, the argument $target is not read, but its basename is taken to
 455 be the id of the article. (OMG, hackers!)
 456
 457 =cut
 458 sub parse_report {
 459     my($target,$dumpvars,%Opt) = @_;
 460     our @q;
 461     my $id = basename($target);
 462     # warn "DEBUG: id[$id]";
 463     my($ok,$about);
 464
 465     my(%extract);
 466
 467     my($report,$isHTML) = _get_cooked_report($target, \%Opt);
 468     my @qr = map /^qr:(.+)/, @{$Opt{q}};
 469     if ($Opt{raw} || @qr) {
 470         for my $qr (@qr) {
 471             my $cqr = eval "qr{$qr}";
 472             die "Could not compile regular expression '$qr': $@" if $@;
 473             my(@matches) = $report =~ $cqr;
 474             my $v;
 475             if (@matches) {
 476                 if (@matches==1) {
 477                     $v = $matches[0];
 478                 } else {
 479                     $v = join "", map {"($_)"} @matches;
 480                 }
 481             } else {
 482                 $v = "";
 483             }
 484             $extract{"qr:$qr"} = $v;
 485         }
 486     }
 487
 488     my $report_writer;
 489     my $moduleunpack = {};
 490     my $expect_prereq = 0;
 491     my $expect_toolchain = 0;
 492     my $expecting_toolchain_soon = 0;
 493     my $fallback_p5 = "";
 494
 495     my $in_summary = 0;
 496     my $in_summary_seen_platform = 0;
 497     my $in_prg_output = 0;
 498     my $in_env_context = 0;
 499
 500     my $current_headline;
 501     my @previous_line = ""; # so we can neutralize line breaks
 502     my @rlines = split /\r?\n/, $report;
 503  LINE: for (@rlines) {
 504         next LINE unless ($isHTML ? m/<title>(\S+)\s+(\S+)/ : m/^Subject:\s*(\S+)\s+(\S+)/);
 505         $ok = $1;
 506         $about = $2;
 507         $extract{"meta:ok"}    = $ok;
 508         $extract{"meta:about"} = $about;
 509         last;
 510     }
 511  LINE: while (@rlines) {
 512         $_ = shift @rlines;
 513         while (/!$/ and @rlines) {
 514             my $followupline = shift @rlines;
 515             $followupline =~ s/^\s+//; # remo leading space
 516             $_ .= $followupline;
 517         }
 518         if (/^--------/ && $previous_line[-2] && $previous_line[-2] =~ /^--------/) {
 519             $current_headline = $previous_line[-1];
 520             if ($current_headline =~ /PROGRAM OUTPUT/) {
 521                 $in_prg_output = 1;
 522             } else {
 523                 $in_prg_output = 0;
 524             }
 525             if ($current_headline =~ /ENVIRONMENT AND OTHER CONTEXT/) {
 526                 $in_env_context = 1;
 527             } else {
 528                 $in_env_context = 0;
 529             }
 530         }
 531         if ($extract{"meta:perl"}) {
 532             if (    $in_summary
 533                 and !$extract{"conf:git_commit_id"}
 534                 and /Commit id:\s*([[:xdigit:]]+)/) {
 535                 $extract{"conf:git_commit_id"} = $1;
 536             }
 537         } else {
 538             my $p5;
 539             if (0) {
 540             } elsif (/Summary of my perl5 \((.+)\) configuration:/) {
 541                 $p5 = $1;
 542                 $in_summary = 1;
 543                 $in_env_context = 0;
 544             }
 545             if ($p5) {
 546                 my($r,$v,$s,$p);
 547                 if (($r,$v,$s,$p) = $p5 =~ /revision (\S+) version (\S+) subversion (\S+) patch (\S+)/) {
 548                     $r =~ s/\.0//; # 5.0 6 2!
 549                     $extract{"meta:perl"} = "$r.$v.$s\@$p";
 550                 } elsif (($r,$v,$s) = $p5 =~ /revision (\S+) version (\S+) subversion (\S+)/) {
 551                     $r =~ s/\.0//;
 552                     $extract{"meta:perl"} = "$r.$v.$s";
 553                 } elsif (($r,$v,$s) = $p5 =~ /(\d+\S*) patchlevel (\S+) subversion (\S+)/) {
 554                     $r =~ s/\.0//;
 555                     $extract{"meta:perl"} = "$r.$v.$s";
 556                 } else {
 557                     $extract{"meta:perl"} = $p5;
 558                 }
 559             }
 560         }
 561         unless ($extract{"meta:from"}) {
 562             if (0) {
 563             } elsif ($isHTML ?
 564                      m|<div class="h_name">From:</div> <b>(.+?)</b><br/>| :
 565                      m|^From:\s*(.+)|
 566                     ) {
 567                 $extract{"meta:from"} = $1;
 568             }
 569             $extract{"meta:from"} =~ s/\.$// if $extract{"meta:from"};
 570         }
 571         unless ($extract{"meta:date"}) {
 572             if (0) {
 573             } elsif ($isHTML ?
 574                      m|<div class="h_name">Date:</div> (.+?)<br/>| :
 575                      m|^Date:\s*(.+)|
 576                     ) {
 577                 my $date = $1;
 578                 my($dt);
 579             DATEFMT: for my $pat ("%a, %d %b %Y %T %z", # Sun, 28 Sep 2008 12:23:12 +0100
 580                                   "%b %d, %Y %R", # July 10,...
 581                                   "%b  %d, %Y %R", # July  4,...
 582                                  ) {
 583                     $dt = eval {
 584                         my $p = DateTime::Format::Strptime->new
 585                             (
 586                              locale => "en",
 587                              time_zone => "UTC",
 588                              pattern => $pat,
 589                             );
 590                         $p->parse_datetime($date)
 591                     };
 592                     last DATEFMT if $dt;
 593                 }
 594                 unless ($dt) {
 595                     warn "Could not parse date[$date], setting to epoch 0";
 596                     $dt = DateTime->from_epoch( epoch => 0 );
 597                 }
 598                 $extract{"meta:date"} = $dt->datetime;
 599             }
 600             $extract{"meta:date"} =~ s/\.$// if $extract{"meta:date"};
 601         }
 602         unless ($extract{"meta:writer"}) {
 603             for ("$previous_line[-1] $_") {
 604                 if (0) {
 605                 } elsif (/CPANPLUS, version (\S+)/) {
 606                     $extract{"meta:writer"} = "CPANPLUS $1";
 607                 } elsif (/created (?:automatically )?by (\S+)/) {
 608                     $extract{"meta:writer"} = $1;
 609                     if (/\s+on\s+perl\s+([^,]+),/) {
 610                         $fallback_p5 = $1;
 611                     }
 612                 } elsif (/This report was machine-generated by (\S+) (\S+)/) {
 613                     $extract{"meta:writer"} = "$1 $2";
 614                 }
 615                 $extract{"meta:writer"} =~ s/[\.,]$// if $extract{"meta:writer"};
 616             }
 617         }
 618         if ($in_summary) {
 619             # we do that first three lines a bit too often
 620             my $qr = $Opt{dumpvars} || "";
 621             $qr = qr/$qr/ if $qr;
 622             unless (@q) {
 623                 @q = @{$Opt{q}||[]};
 624                 @q = qw(meta:perl conf:archname conf:usethreads conf:optimize meta:writer meta:from) unless @q;
 625             }
 626
 627             my %conf_vars = map {($_ => 1)} grep { /^conf:/ } @q;
 628
 629             if (/^\s+Platform:$/) {
 630                 $in_summary_seen_platform=1;
 631             } elsif (/^\s*$/ || m|</pre>|) {
 632                 # if not html, we have reached the end now
 633                 if ($in_summary_seen_platform) {
 634                     # some perls have an empty line after the summary line
 635                     $in_summary = 0;
 636                 }
 637             } else {
 638                 my(%kv) = /\G,?\s*([^=]+)=('[^']+?'|\S+)/gc;
 639                 while (my($k,$v) = each %kv) {
 640                     my $ck = "conf:$k";
 641                     $ck =~ s/\s+$//;
 642                     $v =~ s/,$//;
 643                     if ($v =~ /^'(.*)'$/) {
 644                         $v = $1;
 645                     }
 646                     $v =~ s/^\s+//;
 647                     $v =~ s/\s+$//;
 648                     if ($qr && $ck =~ $qr) {
 649                         $extract{$ck} = $v;
 650                     } elsif ($conf_vars{$ck}) {
 651                         $extract{$ck} = $v;
 652                     }
 653                 }
 654             }
 655         }
 656         if ($in_prg_output) {
 657             unless ($extract{"meta:output_from"}) {
 658                 if (/Output from (.+):$/) {
 659                     $extract{"meta:output_from"} = $1
 660                 }
 661             }
 662         }
 663         if ($in_env_context) {
 664             if (/^\s{4}(\S+)\s*=\s*(.*)$/) {
 665                 $extract{"env:$1"} = $2;
 666             }
 667         }
 668         push @previous_line, $_;
 669         if ($expect_prereq || $expect_toolchain) {
 670             if (/Perl module toolchain versions installed/) {
 671                 # first time discovered in CPANPLUS 0.89_06
 672                 $expecting_toolchain_soon = 1;
 673                 $expect_prereq=0;
 674                 next LINE;
 675             }
 676             if (exists $moduleunpack->{type}) {
 677                 my($module,$v,$needwant);
 678                 # type 1 and 2 are about prereqs, type three about toolchain
 679                 if ($moduleunpack->{type} == 1) {
 680                     (my $leader,$module,$needwant,$v) = eval { unpack $moduleunpack->{tpl}, $_; };
 681                     next LINE if $@;
 682                     if ($leader =~ /^-/) {
 683                         $moduleunpack = {};
 684                         $expect_prereq = 0;
 685                         next LINE;
 686                     } elsif ($leader =~ /^(
 687                                          buil          # build_requires:
 688                                         )/x) {
 689                         next LINE;
 690                     } elsif ($module =~ /^(
 691                                          -             # line drawing
 692                                         )/x) {
 693                         next LINE;
 694                     }
 695                 } elsif ($moduleunpack->{type} == 2) {
 696                     (my $leader,$module,$v,$needwant) = eval { unpack $moduleunpack->{tpl}, $_; };
 697                     next LINE if $@;
 698                     if ($leader =~ /^\*/) {
 699                         $moduleunpack = {};
 700                         $expect_prereq = 0;
 701                         next LINE;
 702                     } elsif (!defined $v
 703                              or !defined $needwant
 704                              or $v =~ /\s/
 705                              or $needwant =~ /\s/
 706                             ) {
 707                         ($module,$v,$needwant) = split " ", $_;
 708                     }
 709                 } elsif ($moduleunpack->{type} == 3) {
 710                     (my $leader,$module,$v) = eval { unpack $moduleunpack->{tpl}, $_; };
 711                     next LINE if $@;
 712                     if (!$module) {
 713                         $moduleunpack = {};
 714                         $expect_toolchain = 0;
 715                         next LINE;
 716                     } elsif ($module =~ /^-/) {
 717                         next LINE;
 718                     }
 719                 }
 720                 $module =~ s/\s+$//;
 721                 if ($module) {
 722                     $v =~ s/^\s+//;
 723                     $v =~ s/\s+$//;
 724                     my($modulename,$versionlead) = split " ", $module;
 725                     if (defined $modulename and defined $versionlead) {
 726                         $module = $modulename;
 727                         $v = "$versionlead$v";
 728                     }
 729                     if ($v eq "Have") {
 730                         next LINE;
 731                     }
 732                     $extract{"mod:$module"} = $v;
 733                     if ($needwant) {
 734                         $needwant =~ s/^\s+//;
 735                         $needwant =~ s/\s+$//;
 736                         $extract{"prereq:$module"} = $needwant;
 737                     }
 738                 }
 739             }
 740             if (/(\s+)(Module\s+)(Need\s+)Have/) {
 741                 $in_env_context = 0;
 742                 $moduleunpack = {
 743                                  tpl => 'a'.length($1).'a'.length($2).'a'.length($3).'a*',
 744                                  type => 1,
 745                                 };
 746             } elsif (/(\s+)(Module Name\s+)(Have)(\s+)Want/) {
 747                 $in_env_context = 0;
 748                 my $adjust_1 = 0;
 749                 my $adjust_2 = -length($4);
 750                 my $adjust_3 = length($4);
 751                 # I think they do not really try to align, usually we
 752                 # get away with split
 753                 $moduleunpack = {
 754                                  tpl => 'a'.length($1).'a'.(length($2)+$adjust_2).'a'.(length($3)+$adjust_3).'a*',
 755                                  type => 2,
 756                                 };
 757             }
 758         }
 759         if (/PREREQUISITES|Prerequisite modules loaded/) {
 760             $in_env_context = 0;
 761             $expect_prereq=1;
 762         }
 763         if ($expecting_toolchain_soon) {
 764             if (/(\s+)(Module(?:\sName)?\s+) Have/) {
 765                 $in_env_context = 0;
 766                 $expect_toolchain=1;
 767                 $expecting_toolchain_soon=0;
 768                 $moduleunpack = {
 769                                  tpl => 'a'.length($1).'a'.length($2).'a*',
 770                                  type => 3,
 771                                 };
 772             }
 773         }
 774         if (/toolchain versions installed/) {
 775             $in_env_context = 0;
 776             $expecting_toolchain_soon=1;
 777         }
 778     }                           # LINE
 779     if (! $extract{"meta:perl"} && $fallback_p5) {
 780         my($p5,$patch) = split /\s+patch\s+/, $fallback_p5;
 781         $extract{"meta:perl"} = $p5;
 782         $extract{"conf:git_describe"} = $patch;
 783     }
 784     if ($Opt{solve}) {
 785         $extract{id} = $id;
 786         if ($extract{"conf:osvers"} && $extract{"conf:archname"}) {
 787             $extract{"conf:archname+osvers"} = join " ", @extract{"conf:archname","conf:osvers"};
 788         }
 789         my $data = $dumpvars->{"==DATA=="} ||= [];
 790         push @$data, \%extract;
 791     }
 792     # ---- %extract finished ----
 793     my $diag = "";
 794     if (my $qr = $Opt{dumpvars}) {
 795         $qr = qr/$qr/;
 796         while (my($k,$v) = each %extract) {
 797             if ($k =~ $qr) {
 798                 $dumpvars->{$k}{$v}{$ok}++;
 799             }
 800         }
 801     }
 802     for my $want (@q) {
 803         my $have  = $extract{$want} || "";
 804         $diag .= " $want\[$have]";
 805     }
 806     printf STDERR " %-4s %8d%s\n", $ok, $id, $diag unless $Opt{quiet};
 807     if ($Opt{raw}) {
 808         $report =~ s/\s+\z//;
 809         print STDERR $report, "\n================\n" unless $Opt{quiet};
 810     }
 811     if ($Opt{interactive}) {
 812         require IO::Prompt;
 813         local @ARGV;
 814         local $ARGV;
 815         my $ans = IO::Prompt::prompt
 816             (
 817              -p => "View $id? [onechar: ynq] ",
 818              -d => "y",
 819              -u => qr/[ynq]/,
 820              -onechar,
 821             );
 822         print STDERR "\n" unless $Opt{quiet};
 823         if ($ans eq "y") {
 824             my($report) = _get_cooked_report($target, \%Opt);
 825             $Opt{pager} ||= "less";
 826             open my $lfh, "|-", $Opt{pager} or die "Could not fork '$Opt{pager}': $!";
 827             local $/;
 828             print {$lfh} $report;
 829             close $lfh or die "Could not close pager: $!"
 830         } elsif ($ans eq "q") {
 831             $Signal++;
 832             return;
 833         }
 834     }
 835     return \%extract;
 836 }
 837
 838 sub _get_cooked_report {
 839     my($target, $Opt_ref) = @_;
 840     my($report, $isHTML);
 841     if ($report = $Opt_ref->{article}) {
 842         $isHTML = $report =~ /^</;
 843         undef $target;
 844     }
 845     if ($target) {
 846         open my $fh, $target or die "Could not open '$target': $!";
 847         local $/;
 848         my $raw_report = <$fh>;
 849         $isHTML = $raw_report =~ /^</;
 850         if ($isHTML) {
 851             $report = decode_entities($raw_report);
 852         } elsif ($raw_report =~ /^MIME-Version: 1.0$/m
 853                  ||
 854                  _looks_like_qp($raw_report)
 855                 ) {
 856             # minimizing MIME effort; don't know about reports in other formats
 857             $report = MIME::QuotedPrint::decode_qp($raw_report);
 858         } else {
 859             $report = $raw_report;
 860         }
 861         close $fh;
 862     }
 863     if ($report =~ /\r\n/) {
 864         my @rlines = split /\r?\n/, $report;
 865         $report = join "\n", @rlines;
 866     }
 867     ($report, $isHTML);
 868 }
 869
 870 =head2 solve
 871
 872 Feeds a couple of potentially interesting data to
 873 Statistics::Regression and sorts the result by R^2 descending. Do not
 874 confuse this with a prove, rather take it as a useful hint. It can
 875 save you minutes of staring at data and provide a quick overview where
 876 one should look closer. Displays the N top candidates, where N
 877 defaults to 3 and can be set with the C<$Opt{solvetop}> variable.
 878 Regressions results with an R^2 of 1.00 are displayed in any case.
 879
 880 The function is called when the option C<-solve> is give on the
 881 commandline. Several extra config variables are calculated, see source
 882 code for details.
 883
 884 =cut
 885 {
 886     my %never_solve_on = map {($_ => 1)}
 887         (
 888          "conf:ccflags",
 889          "conf:config_args",
 890          "conf:cppflags",
 891          "conf:lddlflags",
 892          "conf:uname",
 893          "env:PATH",
 894          "env:PERL5LIB",
 895          "env:PERL5OPT",
 896          'env:$^X',
 897          'env:$EGID',
 898          'env:$GID',
 899          'env:$UID/$EUID',
 900          'env:PERL5_CPANPLUS_IS_RUNNING',
 901          'env:PERL5_CPAN_IS_RUNNING',
 902          'env:PERL5_CPAN_IS_RUNNING_IN_RECURSION',
 903          'meta:ok',
 904         );
 905     my %normalize_numeric =
 906         (
 907          id => sub { return shift },
 908          'meta:date' => sub {
 909              my $v = shift;
 910              my($Y,$M,$D,$h,$m,$s) = $v =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)/;
 911              unless (defined $M) {
 912                  die "illegal value[$v] for a date";
 913              }
 914              Time::Local::timegm($s,$m,$h,$D,$M-1,$Y);
 915          },
 916         );
 917     my %normalize_value =
 918         (
 919          'meta:perl' => sub {
 920              my($perlatpatchlevel) = shift;
 921              my $perl = $perlatpatchlevel;
 922              $perl =~ s/\@.*//;
 923              $perl;
 924          },
 925         );
 926 sub solve {
 927     my($V,%Opt) = @_;
 928     require Statistics::Regression;
 929     my @regression;
 930     my $ycb;
 931     if (my $ycbbody = $Opt{ycb}) {
 932         $ycb = eval('sub {'.$ycbbody.'}');
 933         die if $@;
 934     } else {
 935         $ycb = sub {
 936             my $rec = shift;
 937             my $y;
 938             if ($rec->{"meta:ok"} eq "PASS") {
 939                 $y = 1;
 940             } elsif ($rec->{"meta:ok"} eq "FAIL") {
 941                 $y = 0;
 942             }
 943             return $y
 944         };
 945     }
 946   VAR: for my $variable (sort keys %$V) {
 947         next if $variable eq "==DATA==";
 948         if ($never_solve_on{$variable}){
 949             warn "Skipping '$variable'\n" unless $Opt{quiet};
 950             next VAR;
 951         }
 952         my $value_distribution = $V->{$variable};
 953         my $keys = keys %$value_distribution;
 954         my @X = qw(const);
 955         if ($normalize_numeric{$variable}) {
 956             push @X, "n_$variable";
 957         } else {
 958             my %seen = ();
 959             for my $value (sort keys %$value_distribution) {
 960                 my $pf = $value_distribution->{$value};
 961                 $pf->{PASS} ||= 0;
 962                 $pf->{FAIL} ||= 0;
 963                 if ($pf->{PASS} || $pf->{FAIL}) {
 964                     my $Xele = sprintf "eq_%s",
 965                         (
 966                          $normalize_value{$variable} ?
 967                          $normalize_value{$variable}->($value) :
 968                          $value
 969                         );
 970                     push @X, $Xele unless $seen{$Xele}++;
 971
 972                 }
 973                 if (
 974                     $pf->{PASS} xor $pf->{FAIL}
 975                    ) {
 976                     my $vl = 40;
 977                     substr($value,$vl) = "..." if length $value > 3+$vl;
 978                     my $poor_mans_freehand_estimation = 0;
 979                     if ($poor_mans_freehand_estimation) {
 980                         warn sprintf
 981                             (
 982                              "%4d %4d %-23s | %s\n",
 983                              $pf->{PASS},
 984                              $pf->{FAIL},
 985                              $variable,
 986                              $value,
 987                             );
 988                     }
 989                 }
 990             }
 991         }
 992         warn "variable[$variable]keys[$keys]X[@X]\n" unless $Opt{quiet};
 993         next VAR unless @X > 1;
 994         my %regdata =
 995             (
 996              X => \@X,
 997              data => [],
 998             );
 999       RECORD: for my $rec (@{$V->{"==DATA=="}}) {
1000             my $y = $ycb->($rec);
1001             next RECORD unless defined $y;
1002             my %obs;
1003             $obs{Y} = $y;
1004             @obs{@X} = (0) x @X;
1005             $obs{const} = 1;
1006             for my $x (@X) {
1007                 if ($x =~ /^eq_(.+)/) {
1008                     my $read_v = $1;
1009                     if (exists $rec->{$variable}
1010                         && defined $rec->{$variable}
1011                        ) {
1012                         my $use_v = (
1013                                      $normalize_value{$variable} ?
1014                                      $normalize_value{$variable}->($rec->{$variable}) :
1015                                      $rec->{$variable}
1016                                     );
1017                         if ($use_v eq $read_v) {
1018                             $obs{$x} = 1;
1019                         }
1020                     }
1021                     # warn "DEBUG: y[$y]x[$x]obs[$obs{$x}]\n";
1022                 } elsif ($x =~ /^n_(.+)/) {
1023                     my $v = $1;
1024                     $obs{$x} = eval { $normalize_numeric{$v}->($rec->{$v}); };
1025                     if ($@) {
1026                         warn "Warning: error during parsing v[$v] in record[$rec->{id}]: $@; continuing with undef value";
1027                     }
1028                 }
1029             }
1030             push @{$regdata{data}}, \%obs;
1031         }
1032         _run_regression ($variable, \%regdata, \@regression, \%Opt);
1033     }
1034     my $top = min ($Opt{solvetop} || 3, scalar @regression);
1035     my $max_rsq = sum map {1==$_->rsq ? 1 : 0} @regression;
1036     $top = $max_rsq if $max_rsq && $max_rsq > $top;
1037     my $score = 0;
1038     printf
1039         (
1040          "State after regression testing: %d results, showing top %d\n\n",
1041          scalar @regression,
1042          $top,
1043         );
1044     for my $reg (sort {
1045                      $b->rsq <=> $a->rsq
1046                      ||
1047                      $a->k <=> $b->k
1048                  } @regression) {
1049         printf "(%d)\n", ++$score;
1050         eval { $reg->print; };
1051         if ($@) {
1052             printf "\n\nOops, Statistics::Regression died during ->print() with error message[$@]\n\n";
1053         }
1054         last if --$top <= 0;
1055     }
1056 }
1057 }
1058
1059 # $variable is the name we pass through to S:R constructor
1060 # $regdata is hash and has the arrays "X" and "data" (observations)
1061 # X goes to S:R constructor
1062 # each observation has a Y which we pass to S:R in an include() call
1063 # $regression is the collector array of results
1064 # $opt are the options from outside, used to see if we are "verbose"
1065 sub _run_regression {
1066     my($variable,$regdata,$regression,$opt) = @_;
1067     my @X = @{$regdata->{X}};
1068     # my $splo = $regdata->{"spliced-out"} = []; # maybe can be used to
1069                                                # hold the reference
1070                                                # group
1071     while (@X > 1) {
1072         my $reg = Statistics::Regression->new($variable,\@X);
1073         for my $obs (@{$regdata->{data}}) {
1074             my $y = delete $obs->{Y};
1075             $reg->include($y, $obs);
1076             $obs->{Y} = $y;
1077         }
1078         eval {$reg->theta;
1079               my @e = $reg->standarderrors;
1080               die "found standarderrors == 0" if grep { 0 == $_ } @e;
1081               $reg->rsq;};
1082         if ($@) {
1083             if ($opt->{verbose} && $opt->{verbose}>=2) {
1084                 require YAML::Syck;
1085                 warn YAML::Syck::Dump
1086                     ({error=>"could not determine some regression parameters",
1087                       variable=>$variable,
1088                       k=>$reg->k,
1089                       n=>$reg->n,
1090                       X=>$regdata->{"X"},
1091                       errorstr => $@,
1092                      });
1093             }
1094             # reduce k in case that linear dependencies disturbed us;
1095             # often called reference group; I'm tempted to collect and
1096             # make visible
1097             splice @X, 1, 1;
1098         } else {
1099             # $reg->print;
1100             push @$regression, $reg;
1101             return;
1102         }
1103     }
1104 }
1105
1106 =head1 AUTHOR
1107
1108 Andreas König
1109
1110 =head1 BUGS
1111
1112 Please report any bugs or feature requests through the web
1113 interface at
1114 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=CPAN-Testers-ParseReport>.
1115 I will be notified, and then you'll automatically be notified of
1116 progress on your bug as I make changes.
1117
1118 =head1 SUPPORT
1119
1120 You can find documentation for this module with the perldoc command.
1121
1122     perldoc CPAN::Testers::ParseReport
1123
1124
1125 You can also look for information at:
1126
1127 =over 4
1128
1129 =item * RT: CPAN's request tracker
1130
1131 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=CPAN-Testers-ParseReport>
1132
1133 =item * AnnoCPAN: Annotated CPAN documentation
1134
1135 L<http://annocpan.org/dist/CPAN-Testers-ParseReport>
1136
1137 =item * CPAN Ratings
1138
1139 L<http://cpanratings.perl.org/d/CPAN-Testers-ParseReport>
1140
1141 =item * Search CPAN
1142
1143 L<http://search.cpan.org/dist/CPAN-Testers-ParseReport>
1144
1145 =back
1146
1147
1148 =head1 ACKNOWLEDGEMENTS
1149
1150 Thanks to RJBS for module-starter.
1151
1152 =head1 COPYRIGHT & LICENSE
1153
1154 Copyright 2008 Andreas König.
1155
1156 This program is free software; you can redistribute it and/or modify it
1157 under the same terms as Perl itself.
1158
1159
1160 =cut
1161
1162 1; # End of CPAN::Testers::ParseReport