lib/CPAN/Testers/ParseReport.pm

   1 package CPAN::Testers::ParseReport;
   2
   3 use warnings;
   4 use strict;
   5
   6 use Config::Perl::V ();
   7 use DateTime::Format::Strptime;
   8 use DateTime::Format::DateParse;
   9 use File::Basename qw(basename);
  10 use File::Path qw(mkpath);
  11 use HTML::Entities qw(decode_entities);
  12 use LWP::UserAgent;
  13 use List::Util qw(max min sum);
  14 use MIME::QuotedPrint ();
  15 use Net::NNTP ();
  16 use Time::Local ();
  17 use XML::LibXML;
  18 use XML::LibXML::XPathContext;
  19
  20 our $default_ctformat = "yaml";
  21 our $default_transport = "nntp";
  22 our $default_cturl = "http://www.cpantesters.org/show";
  23 our $Signal = 0;
  24
  25 =encoding utf-8
  26
  27 =head1 NAME
  28
  29 CPAN::Testers::ParseReport - parse reports to www.cpantesters.org from various sources
  30
  31 =cut
  32
  33 use version; our $VERSION = qv('0.1.7');
  34
  35 =head1 SYNOPSIS
  36
  37 The documentation in here is normally not needed because the code is
  38 meant to be run from a standalone program, L<ctgetreports>.
  39
  40   ctgetreports --q mod:Moose Devel-Events
  41
  42 =head1 DESCRIPTION
  43
  44 This is the core module for CPAN::Testers::ParseReport. If you're not
  45 looking to extend or alter the behaviour of this module, you probably
  46 want to look at L<ctgetreports> instead.
  47
  48 =head1 OPTIONS
  49
  50 Are described in the L<ctgetreports> manpage and are passed through to
  51 the functions unaltered.
  52
  53 =head1 FUNCTIONS
  54
  55 =head2 parse_distro($distro,%options)
  56
  57 reads the cpantesters HTML page or the YAML file or the local database
  58 for the distro and loops through the reports for the specified or most
  59 recent version of that distro found in these data.
  60
  61 parse_distro() intentionally has no meaningful return value, different
  62 options would require different ones.
  63
  64 =head2 $extract = parse_single_report($report,$dumpvars,%options)
  65
  66 mirrors and reads this report. $report is of the form
  67
  68   { id => number }
  69
  70 $dumpvar is a hashreference that gets filled with data.
  71
  72 $extract is the result of parse_report() described below.
  73
  74 =cut
  75
  76 {
  77     my $ua;
  78     sub _ua {
  79         return $ua if $ua;
  80         $ua = LWP::UserAgent->new
  81             (
  82              keep_alive => 1,
  83              env_proxy => 1,
  84             );
  85         $ua->parse_head(0);
  86         $ua;
  87     }
  88 }
  89
  90 {
  91     my $nntp;
  92     sub _nntp {
  93         return $nntp if $nntp;
  94         $nntp = Net::NNTP->new("nntp.perl.org");
  95         $nntp->group("perl.cpan.testers");
  96         return $nntp;
  97     }
  98 }
  99
 100 {
 101     my $xp;
 102     sub _xp {
 103         return $xp if $xp;
 104         $xp = XML::LibXML->new;
 105         $xp->keep_blanks(0);
 106         $xp->clean_namespaces(1);
 107         my $catalog = __FILE__;
 108         $catalog =~ s|ParseReport.pm$|ParseReport/catalog|;
 109         $xp->load_catalog($catalog);
 110         return $xp;
 111     }
 112 }
 113
 114 sub _download_overview {
 115     my($cts_dir, $distro, %Opt) = @_;
 116     my $format = $Opt{ctformat} ||= $default_ctformat;
 117     my $cturl = $Opt{cturl} ||= $default_cturl;
 118     my $ctarget = "$cts_dir/$distro.$format";
 119     my $cheaders = "$cts_dir/$distro.headers";
 120     if ($Opt{local}) {
 121         unless (-e $ctarget) {
 122             die "Alert: No local file '$ctarget' found, cannot continue\n";
 123         }
 124     } else {
 125         if (! -e $ctarget or -M $ctarget > .25) {
 126             if (-e $ctarget && $Opt{verbose}) {
 127                 my(@stat) = stat _;
 128                 my $timestamp = gmtime $stat[9];
 129                 print STDERR "(timestamp $timestamp GMT)\n" unless $Opt{quiet};
 130             }
 131             print STDERR "Fetching $ctarget..." if $Opt{verbose} && !$Opt{quiet};
 132             my $uri = "$cturl/$distro.$format";
 133             my $resp = _ua->mirror($uri,$ctarget);
 134             if ($resp->is_success) {
 135                 print STDERR "DONE\n" if $Opt{verbose} && !$Opt{quiet};
 136                 open my $fh, ">", $cheaders or die;
 137                 for ($resp->headers->as_string) {
 138                     print $fh $_;
 139                     if ($Opt{verbose} && $Opt{verbose}>1) {
 140                         print STDERR $_ unless $Opt{quiet};
 141                     }
 142                 }
 143             } elsif (304 == $resp->code) {
 144                 print STDERR "DONE (not modified)\n" if $Opt{verbose} && !$Opt{quiet};
 145                 my $atime = my $mtime = time;
 146                 utime $atime, $mtime, $cheaders;
 147             } else {
 148                 die sprintf
 149                     (
 150                      "No success downloading %s: %s",
 151                      $uri,
 152                      $resp->status_line,
 153                     );
 154             }
 155         }
 156     }
 157     return $ctarget;
 158 }
 159
 160 sub _parse_html {
 161     my($ctarget, %Opt) = @_;
 162     my $content = do { open my $fh, $ctarget or die; local $/; <$fh> };
 163     my $preprocesswithtreebuilder = 0; # not needed since barbie switched to XHTML
 164     if ($preprocesswithtreebuilder) {
 165         require HTML::TreeBuilder;
 166         my $tree = HTML::TreeBuilder->new;
 167         $tree->implicit_tags(1);
 168         $tree->p_strict(1);
 169         $tree->ignore_ignorable_whitespace(0);
 170         $tree->parse_content($content);
 171         $tree->eof;
 172         $content = $tree->as_XML;
 173     }
 174     my $parser = _xp();
 175     my $doc = eval { $parser->parse_string($content) };
 176     my $err = $@;
 177     unless ($doc) {
 178         my $distro = basename $ctarget;
 179         die sprintf "Error while parsing %s\: %s", $distro, $err;
 180     }
 181     my $xc = XML::LibXML::XPathContext->new($doc);
 182     my $nsu = $doc->documentElement->namespaceURI;
 183     $xc->registerNs('x', $nsu) if $nsu;
 184     my($selected_release_ul,$selected_release_distrov,$excuse_string);
 185     my($cparentdiv)
 186         = $nsu ?
 187             $xc->findnodes("/x:html/x:body/x:div[\@id = 'doc']") :
 188                 $doc->findnodes("/html/body/div[\@id = 'doc']");
 189     my(@releasedivs) = $nsu ?
 190         $xc->findnodes("//x:div[x:h2 and x:ul]",$cparentdiv) :
 191             $cparentdiv->findnodes("//div[h2 and ul]");
 192     my $releasediv;
 193     if ($Opt{vdistro}) {
 194         $excuse_string = "selected distro '$Opt{vdistro}'";
 195         my($fallbacktoversion) = $Opt{vdistro} =~ /(\d+\..*)/;
 196         $fallbacktoversion = 0 unless defined $fallbacktoversion;
 197       RELEASE: for my $i (0..$#releasedivs) {
 198             my $picked = "";
 199             my($x) = $nsu ?
 200                 $xc->findvalue("x:h2/x:a[2]/\@name",$releasedivs[$i]) :
 201                     $releasedivs[$i]->findvalue("h2/a[2]/\@name");
 202             if ($x) {
 203                 if ($x eq $Opt{vdistro}) {
 204                     $releasediv = $i;
 205                     $picked = " (picked)";
 206                 }
 207                 print STDERR "FOUND DISTRO: $x$picked\n" unless $Opt{quiet};
 208             } else {
 209                 ($x) = $nsu ?
 210                     $xc->findvalue("x:h2/x:a[1]/\@name",$releasedivs[$i]) :
 211                         $releasedivs[$i]->findvalue("h2/a[1]/\@name");
 212                 if ($x eq $fallbacktoversion) {
 213                     $releasediv = $i;
 214                     $picked = " (picked)";
 215                 }
 216                 print STDERR "FOUND VERSION: $x$picked\n" unless $Opt{quiet};
 217             }
 218         }
 219     } else {
 220         $excuse_string = "any distro";
 221     }
 222     unless (defined $releasediv) {
 223         $releasediv = 0;
 224     }
 225     # using a[1] because a[2] is missing on the first entry
 226     ($selected_release_distrov) = $nsu ?
 227         $xc->findvalue("x:h2/x:a[1]/\@name",$releasedivs[$releasediv]) :
 228             $releasedivs[$releasediv]->findvalue("h2/a[1]/\@name");
 229     ($selected_release_ul) = $nsu ?
 230         $xc->findnodes("x:ul",$releasedivs[$releasediv]) :
 231             $releasedivs[$releasediv]->findnodes("ul");
 232     unless (defined $selected_release_distrov) {
 233         warn "Warning: could not find $excuse_string in '$ctarget'";
 234         return;
 235     }
 236     print STDERR "SELECTED: $selected_release_distrov\n" unless $Opt{quiet};
 237     my($id);
 238     my @all;
 239     for my $test ($nsu ?
 240                   $xc->findnodes("x:li",$selected_release_ul) :
 241                   $selected_release_ul->findnodes("li")) {
 242         $id = $nsu ?
 243             $xc->findvalue("x:a[1]/text()",$test)     :
 244                 $test->findvalue("a[1]/text()");
 245         push @all, {id=>$id};
 246         return if $Signal;
 247     }
 248     return \@all;
 249 }
 250
 251 sub _parse_yaml {
 252     my($ctarget, %Opt) = @_;
 253     require YAML::Syck;
 254     my $arr = YAML::Syck::LoadFile($ctarget);
 255     my($selected_release_ul,$selected_release_distrov,$excuse_string);
 256     if ($Opt{vdistro}) {
 257         $excuse_string = "selected distro '$Opt{vdistro}'";
 258         $arr = [grep {$_->{distversion} eq $Opt{vdistro}} @$arr];
 259         ($selected_release_distrov) = $arr->[0]{distversion};
 260     } else {
 261         $excuse_string = "any distro";
 262         my $last_addition;
 263         my %seen;
 264         for my $report (sort { $a->{id} <=> $b->{id} } @$arr) {
 265             unless ($seen{$report->{distversion}}++) {
 266                 $last_addition = $report->{distversion};
 267             }
 268         }
 269         $arr = [grep {$_->{distversion} eq $last_addition} @$arr];
 270         ($selected_release_distrov) = $last_addition;
 271     }
 272     unless ($selected_release_distrov) {
 273         warn "Warning: could not find $excuse_string in '$ctarget'";
 274         return;
 275     }
 276     print STDERR "SELECTED: $selected_release_distrov\n" unless $Opt{quiet};
 277     my @all;
 278     for my $test (@$arr) {
 279         my $id = $test->{id};
 280         push @all, {id=>$id};
 281         return if $Signal;
 282     }
 283     @all = sort { $b->{id} <=> $a->{id} } @all;
 284     return \@all;
 285 }
 286
 287 sub parse_single_report {
 288     my($report, $dumpvars, %Opt) = @_;
 289     my($id) = $report->{id};
 290     $Opt{cachedir} ||= "$ENV{HOME}/var/cpantesters";
 291     my $nnt_dir = "$Opt{cachedir}/nntp-testers";
 292     mkpath $nnt_dir;
 293     my $target = "$nnt_dir/$id";
 294     if ($Opt{local}) {
 295         unless (-e $target) {
 296             die {severity=>0,text=>"Warning: No local file '$target' found, skipping\n"};
 297         }
 298     } else {
 299         if (! -e $target) {
 300             print STDERR "Fetching $target..." if $Opt{verbose} && !$Opt{quiet};
 301             $Opt{transport} ||= $default_transport;
 302             if ($Opt{transport} eq "nntp") {
 303                 my $article = _nntp->article($id);
 304                 unless ($article) {
 305                     die {severity=>0,text=>"NNTP-Server did not return an article for id[$id]"};
 306                 }
 307                 open my $fh, ">", $target or die {severity=>1,text=>"Could not open >$target: $!"};
 308                 print $fh @$article;
 309             } elsif ($Opt{transport} eq "http") {
 310                 my $resp = _ua->mirror("http://www.nntp.perl.org/group/perl.cpan.testers/$id",$target);
 311                 if ($resp->is_success) {
 312                     if ($Opt{verbose}) {
 313                         my(@stat) = stat $target;
 314                         my $timestamp = gmtime $stat[9];
 315                         print STDERR "(timestamp $timestamp GMT)\n" unless $Opt{quiet};
 316                         if ($Opt{verbose} > 1) {
 317                             print STDERR $resp->headers->as_string unless $Opt{quiet};
 318                         }
 319                     }
 320                     my $headers = "$target.headers";
 321                     open my $fh, ">", $headers or die {severity=>1,text=>"Could not open >$headers: $!"};
 322                     print $fh $resp->headers->as_string;
 323                 } else {
 324                     die {severity=>0,
 325                              text=>sprintf "HTTP Server Error[%s] for id[%s]", $resp->status_line, $id};
 326                 }
 327             } else {
 328                 die {severity=>1,text=>"Illegal value for --transport: '$Opt{transport}'"};
 329             }
 330         }
 331     }
 332     parse_report($target, $dumpvars, %Opt);
 333 }
 334
 335 sub parse_distro {
 336     my($distro,%Opt) = @_;
 337     my %dumpvars;
 338     $Opt{cachedir} ||= "$ENV{HOME}/var/cpantesters";
 339     my $cts_dir = "$Opt{cachedir}/cpantesters-show";
 340     mkpath $cts_dir;
 341     if ($Opt{solve}) {
 342         require Statistics::Regression;
 343         $Opt{dumpvars} = "." unless defined $Opt{dumpvars};
 344     }
 345     if (!$Opt{vdistro} && $distro =~ /^(.+)-(?i:v?\d+)(?:\.\d+)*\w*$/) {
 346         $Opt{vdistro} = $distro;
 347         $distro = $1;
 348     }
 349     my $reports;
 350     if (my $ctdb = $Opt{ctdb}) {
 351         require CPAN::WWW::Testers::Generator::Database;
 352         require CPAN::DistnameInfo;
 353         my $dbi = CPAN::WWW::Testers::Generator::Database->new(database=>$ctdb) or die "Alert: unknown error while opening database '$ctdb'";
 354         unless ($Opt{vdistro}) {
 355             my $sql = "select version from cpanstats where dist=? order by id";
 356             my @rows = $dbi->get_query($sql,$distro);
 357             my($newest,%seen);
 358             for my $row (@rows) {
 359                 $newest = $row->[0] unless $seen{$row->[0]}++;
 360             }
 361             $Opt{vdistro} = "$distro-$newest";
 362         }
 363         my $d = CPAN::DistnameInfo->new("FOO/$Opt{vdistro}.tgz");
 364         my $dist = $d->dist;
 365         my $version = $d->version;
 366         my $sql = "select id from cpanstats where dist=? and version=? order by id desc";
 367         my @rows = $dbi->get_query($sql,$dist,$version);
 368         my @all;
 369         for my $row (@rows) {
 370             my $id = $row->[0];
 371             push @all, {id=>$id};
 372         }
 373         $reports = \@all;
 374     } else {
 375         my $ctarget = _download_overview($cts_dir, $distro, %Opt);
 376         $Opt{ctformat} ||= $default_ctformat;
 377         if ($Opt{ctformat} eq "html") {
 378             $reports = _parse_html($ctarget,%Opt);
 379         } else {
 380             $reports = _parse_yaml($ctarget,%Opt);
 381         }
 382     }
 383     return unless $reports;
 384     for my $report (@$reports) {
 385         eval {parse_single_report($report, \%dumpvars, %Opt)};
 386         if ($@) {
 387             if (ref $@) {
 388                 if ($@->{severity}) {
 389                     die $@->{text};
 390                 } else {
 391                     warn $@->{text};
 392                 }
 393             } else {
 394                 die $@;
 395             }
 396         }
 397         last if $Signal;
 398     }
 399     if ($Opt{dumpvars}) {
 400         require YAML::Syck;
 401         my $dumpfile = $Opt{dumpfile} || "ctgetreports.out";
 402         open my $fh, ">", $dumpfile or die "Could not open '$dumpfile' for writing: $!";
 403         print $fh YAML::Syck::Dump(\%dumpvars);
 404         close $fh or die "Could not close '$dumpfile': $!"
 405     }
 406     if ($Opt{solve}) {
 407         solve(\%dumpvars,%Opt);
 408     }
 409 }
 410
 411 =head2 $bool = _looks_like_qp($raw_report)
 412
 413 We had to acknowledge the fact that some MTAs swallow the MIME-Version
 414 header while passing MIME through. So we introduce fallback heuristics
 415 that try to determine if a report is written in quoted printable.
 416
 417 Note that this subroutine is internal, just documented to have the
 418 internals documented.
 419
 420 The current implementation counts the number of QP escaped spaces and
 421 equal signs.
 422
 423 =cut
 424
 425 sub _looks_like_qp {
 426     my($report) = @_;
 427     my $count_space = () = $report =~ /=20/g;
 428     return 1 if $count_space > 12;
 429     my $count_equal = () = $report =~ /=3D/g;
 430     return 1 if $count_equal > 12;
 431     return 1 if $count_space+$count_equal > 24;
 432     return 0; # waiting for a counter example
 433 }
 434
 435 =head2 $extract = parse_report($target,$dumpvars,%Opt)
 436
 437 Reads one report. $target is the local filename to read. $dumpvars is
 438 a hashref which gets filled with descriptive stats about
 439 PASS/FAIL/etc. %Opt are the options as described in the
 440 C<ctgetreports> manpage. $extract is a hashref containing the found
 441 variables.
 442
 443 Note: this parsing is a bit dirty but as it seems good enough I'm not
 444 inclined to change it. We parse HTML with regexps only, not an HTML
 445 parser. Only the entities are decoded.
 446
 447 Update around version 0.0.17: switching to nntp now but keeping the
 448 parser for HTML around to read old local copies.
 449
 450 Update around 0.0.18: In %Options you can use
 451
 452     article => $some_full_article_as_scalar
 453
 454 to use this function to parse one full article as text. When this is
 455 given, the argument $target is not read, but its basename is taken to
 456 be the id of the article. (OMG, hackers!)
 457
 458 =cut
 459 sub parse_report {
 460     my($target,$dumpvars,%Opt) = @_;
 461     our @q;
 462     my $id = basename($target);
 463     # warn "DEBUG: id[$id]";
 464     my($ok,$about);
 465
 466     my(%extract);
 467
 468     my($report,$isHTML);
 469     if ($report = $Opt{article}) {
 470         $isHTML = $report =~ /^</;
 471         undef $target;
 472     }
 473     if ($target) {
 474         open my $fh, $target or die "Could not open '$target': $!";
 475         local $/;
 476         my $raw_report = <$fh>;
 477         $isHTML = $raw_report =~ /^</;
 478         if ($isHTML) {
 479             $report = decode_entities($raw_report);
 480         } elsif ($raw_report =~ /^MIME-Version: 1.0$/m
 481                  ||
 482                  _looks_like_qp($raw_report)
 483                 ) {
 484             # minimizing MIME effort; don't know about reports in other formats
 485             $report = MIME::QuotedPrint::decode_qp($raw_report);
 486         } else {
 487             $report = $raw_report;
 488         }
 489         close $fh;
 490     }
 491     my @qr = map /^qr:(.+)/, @{$Opt{q}};
 492     if ($Opt{raw} || @qr) {
 493         for my $qr (@qr) {
 494             my $cqr = eval "qr{$qr}";
 495             die "Could not compile regular expression '$qr': $@" if $@;
 496             my(@matches) = $report =~ $cqr;
 497             my $v;
 498             if (@matches) {
 499                 if (@matches==1) {
 500                     $v = $matches[0];
 501                 } else {
 502                     $v = join "", map {"($_)"} @matches;
 503                 }
 504             } else {
 505                 $v = "";
 506             }
 507             $extract{"qr:$qr"} = $v;
 508         }
 509     }
 510
 511     my $report_writer;
 512     my $moduleunpack = {};
 513     my $expect_prereq = 0;
 514     my $expect_toolchain = 0;
 515     my $expecting_toolchain_soon = 0;
 516
 517     my $in_summary = 0;
 518     my $in_summary_seen_platform = 0;
 519     my $in_prg_output = 0;
 520     my $in_env_context = 0;
 521
 522     my $current_headline;
 523     my @previous_line = ""; # so we can neutralize line breaks
 524     my @rlines = split /\r?\n/, $report;
 525  LINE: for (@rlines) {
 526         next LINE unless ($isHTML ? m/<title>(\S+)\s+(\S+)/ : m/^Subject:\s*(\S+)\s+(\S+)/);
 527         $ok = $1;
 528         $about = $2;
 529         $extract{"meta:ok"}    = $ok;
 530         $extract{"meta:about"} = $about;
 531         last;
 532     }
 533  LINE: while (@rlines) {
 534         $_ = shift @rlines;
 535         while (/!$/ and @rlines) {
 536             my $followupline = shift @rlines;
 537             $followupline =~ s/^\s+//; # remo leading space
 538             $_ .= $followupline;
 539         }
 540         if (/^--------/ && $previous_line[-2] && $previous_line[-2] =~ /^--------/) {
 541             $current_headline = $previous_line[-1];
 542             if ($current_headline =~ /PROGRAM OUTPUT/) {
 543                 $in_prg_output = 1;
 544             } else {
 545                 $in_prg_output = 0;
 546             }
 547             if ($current_headline =~ /ENVIRONMENT AND OTHER CONTEXT/) {
 548                 $in_env_context = 1;
 549             } else {
 550                 $in_env_context = 0;
 551             }
 552         }
 553         if ($extract{"meta:perl"}) {
 554             if (    $in_summary
 555                 and !$extract{"conf:git_commit_id"}
 556                 and /Commit id:\s*([[:xdigit:]]+)/) {
 557                 $extract{"conf:git_commit_id"} = $1;
 558             }
 559         } else {
 560             my $p5;
 561             if (0) {
 562             } elsif (/Summary of my perl5 \((.+)\) configuration:/) {
 563                 $p5 = $1;
 564                 $in_summary = 1;
 565                 $in_env_context = 0;
 566             }
 567             if ($p5) {
 568                 my($r,$v,$s,$p);
 569                 if (($r,$v,$s,$p) = $p5 =~ /revision (\S+) version (\S+) subversion (\S+) patch (\S+)/) {
 570                     $r =~ s/\.0//; # 5.0 6 2!
 571                     $extract{"meta:perl"} = "$r.$v.$s\@$p";
 572                 } elsif (($r,$v,$s) = $p5 =~ /revision (\S+) version (\S+) subversion (\S+)/) {
 573                     $r =~ s/\.0//;
 574                     $extract{"meta:perl"} = "$r.$v.$s";
 575                 } elsif (($r,$v,$s) = $p5 =~ /(\d+\S*) patchlevel (\S+) subversion (\S+)/) {
 576                     $r =~ s/\.0//;
 577                     $extract{"meta:perl"} = "$r.$v.$s";
 578                 } else {
 579                     $extract{"meta:perl"} = $p5;
 580                 }
 581             }
 582         }
 583         unless ($extract{"meta:from"}) {
 584             if (0) {
 585             } elsif ($isHTML ?
 586                      m|<div class="h_name">From:</div> <b>(.+?)</b><br/>| :
 587                      m|^From:\s*(.+)|
 588                     ) {
 589                 $extract{"meta:from"} = $1;
 590             }
 591             $extract{"meta:from"} =~ s/\.$// if $extract{"meta:from"};
 592         }
 593         unless ($extract{"meta:date"}) {
 594             if (0) {
 595             } elsif ($isHTML ?
 596                      m|<div class="h_name">Date:</div> (.+?)<br/>| :
 597                      m|^Date:\s*(.+)|
 598                     ) {
 599                 my $date = $1;
 600                 my($dt);
 601                 if ($isHTML) {
 602                     my $p;
 603                     $p = DateTime::Format::Strptime->new(
 604                                                          locale => "en",
 605                                                          time_zone => "UTC",
 606                                                          # April 13, 2005 23:50
 607                                                          pattern => "%b %d, %Y %R",
 608                                                         );
 609                     $dt = $p->parse_datetime($date);
 610                 } else {
 611                     # Sun, 28 Sep 2008 12:23:12 +0100 # but was not consistent
 612                     # pattern => "%a, %d %b %Y %T %z",
 613                     $dt = eval { DateTime::Format::DateParse->parse_datetime($date) };
 614                 }
 615                 unless ($dt) {
 616                     warn "Could not parse date[$date], setting to epoch 0";
 617                     $dt = DateTime->from_epoch( epoch => 0 );
 618                 }
 619                 $extract{"meta:date"} = $dt->datetime;
 620             }
 621             $extract{"meta:date"} =~ s/\.$// if $extract{"meta:date"};
 622         }
 623         unless ($extract{"meta:writer"}) {
 624             for ("$previous_line[-1] $_") {
 625                 if (0) {
 626                 } elsif (/CPANPLUS, version (\S+)/) {
 627                     $extract{"meta:writer"} = "CPANPLUS $1";
 628                 } elsif (/created (?:automatically )?by (\S+)/) {
 629                     $extract{"meta:writer"} = $1;
 630                 } elsif (/This report was machine-generated by (\S+) (\S+)/) {
 631                     $extract{"meta:writer"} = "$1 $2";
 632                 }
 633                 $extract{"meta:writer"} =~ s/[\.,]$// if $extract{"meta:writer"};
 634             }
 635         }
 636         if ($in_summary) {
 637             # we do that first three lines a bit too often
 638             my $qr = $Opt{dumpvars} || "";
 639             $qr = qr/$qr/ if $qr;
 640             unless (@q) {
 641                 @q = @{$Opt{q}||[]};
 642                 @q = qw(meta:perl conf:archname conf:usethreads conf:optimize meta:writer meta:from) unless @q;
 643             }
 644
 645             my %conf_vars = map {($_ => 1)} grep { /^conf:/ } @q;
 646
 647             if (/^\s+Platform:$/) {
 648                 $in_summary_seen_platform=1;
 649             } elsif (/^\s*$/ || m|</pre>|) {
 650                 # if not html, we have reached the end now
 651                 if ($in_summary_seen_platform) {
 652                     # some perls have an empty line after the summary line
 653                     $in_summary = 0;
 654                 }
 655             } else {
 656                 my(%kv) = /\G,?\s*([^=]+)=('[^']+?'|\S+)/gc;
 657                 while (my($k,$v) = each %kv) {
 658                     my $ck = "conf:$k";
 659                     $ck =~ s/\s+$//;
 660                     $v =~ s/,$//;
 661                     if ($v =~ /^'(.*)'$/) {
 662                         $v = $1;
 663                     }
 664                     $v =~ s/^\s+//;
 665                     $v =~ s/\s+$//;
 666                     if ($qr && $ck =~ $qr) {
 667                         $extract{$ck} = $v;
 668                     } elsif ($conf_vars{$ck}) {
 669                         $extract{$ck} = $v;
 670                     }
 671                 }
 672             }
 673         }
 674         if ($in_prg_output) {
 675             unless ($extract{"meta:output_from"}) {
 676                 if (/Output from (.+):$/) {
 677                     $extract{"meta:output_from"} = $1
 678                 }
 679             }
 680         }
 681         if ($in_env_context) {
 682             if (/^\s{4}(\S+)\s*=\s*(.*)$/) {
 683                 $extract{"env:$1"} = $2;
 684             }
 685         }
 686         push @previous_line, $_;
 687         if ($expect_prereq || $expect_toolchain) {
 688             if (/Perl module toolchain versions installed/) {
 689                 # first time discovered in CPANPLUS 0.89_06
 690                 $expecting_toolchain_soon = 1;
 691                 $expect_prereq=0;
 692                 next LINE;
 693             }
 694             if (exists $moduleunpack->{type}) {
 695                 my($module,$v,$needwant);
 696                 # type 1 and 2 are about prereqs, type three about toolchain
 697                 if ($moduleunpack->{type} == 1) {
 698                     (my $leader,$module,$needwant,$v) = eval { unpack $moduleunpack->{tpl}, $_; };
 699                     next LINE if $@;
 700                     if ($leader =~ /^-/) {
 701                         $moduleunpack = {};
 702                         $expect_prereq = 0;
 703                         next LINE;
 704                     } elsif ($leader =~ /^(
 705                                          buil          # build_requires:
 706                                         )/x) {
 707                         next LINE;
 708                     } elsif ($module =~ /^(
 709                                          -             # line drawing
 710                                         )/x) {
 711                         next LINE;
 712                     }
 713                 } elsif ($moduleunpack->{type} == 2) {
 714                     (my $leader,$module,$v,$needwant) = eval { unpack $moduleunpack->{tpl}, $_; };
 715                     next LINE if $@;
 716                     if ($leader =~ /^\*/) {
 717                         $moduleunpack = {};
 718                         $expect_prereq = 0;
 719                         next LINE;
 720                     } elsif ($v =~ /\s/) {
 721                         ($module,$v) = split " ", $_;
 722                     }
 723                 } elsif ($moduleunpack->{type} == 3) {
 724                     (my $leader,$module,$v) = eval { unpack $moduleunpack->{tpl}, $_; };
 725                     next LINE if $@;
 726                     if (!$module) {
 727                         $moduleunpack = {};
 728                         $expect_toolchain = 0;
 729                         next LINE;
 730                     } elsif ($module =~ /^-/) {
 731                         next LINE;
 732                     }
 733                 }
 734                 $module =~ s/\s+$//;
 735                 if ($module) {
 736                     $v =~ s/^\s+//;
 737                     $v =~ s/\s+$//;
 738                     my($modulename,$versionlead) = split " ", $module;
 739                     if (defined $modulename and defined $versionlead) {
 740                         $module = $modulename;
 741                         $v = "$versionlead$v";
 742                     }
 743                     if ($v eq "Have") {
 744                         next LINE;
 745                     }
 746                     $extract{"mod:$module"} = $v;
 747                     if ($needwant) {
 748                         $needwant =~ s/^\s+//;
 749                         $needwant =~ s/\s+$//;
 750                         $extract{"prereq:$module"} = $needwant;
 751                     }
 752                 }
 753             }
 754             if (/(\s+)(Module\s+)(Need\s+)Have/) {
 755                 $in_env_context = 0;
 756                 $moduleunpack = {
 757                                  tpl => 'a'.length($1).'a'.length($2).'a'.length($3).'a*',
 758                                  type => 1,
 759                                 };
 760             } elsif (/(\s+)(Module Name\s+)(Have)(\s+)Want/) {
 761                 $in_env_context = 0;
 762                 my $adjust_1 = 0;
 763                 my $adjust_2 = -length($4);
 764                 my $adjust_3 = length($4);
 765                 # two pass would be required to see where the
 766                 # columns really are. Or could we get away with split?
 767                 $moduleunpack = {
 768                                  tpl => 'a'.length($1).'a'.(length($2)+$adjust_2).'a'.(length($3)+$adjust_3).'a*',
 769                                  type => 2,
 770                                 };
 771             }
 772         }
 773         if (/PREREQUISITES|Prerequisite modules loaded/) {
 774             $in_env_context = 0;
 775             $expect_prereq=1;
 776         }
 777         if ($expecting_toolchain_soon) {
 778             if (/(\s+)(Module(?:\sName)?\s+) Have/) {
 779                 $in_env_context = 0;
 780                 $expect_toolchain=1;
 781                 $expecting_toolchain_soon=0;
 782                 $moduleunpack = {
 783                                  tpl => 'a'.length($1).'a'.length($2).'a*',
 784                                  type => 3,
 785                                 };
 786             }
 787         }
 788         if (/toolchain versions installed/) {
 789             $in_env_context = 0;
 790             $expecting_toolchain_soon=1;
 791         }
 792     }                           # LINE
 793     if ($Opt{solve}) {
 794         $extract{id} = $id;
 795         if ($extract{"conf:osvers"} && $extract{"conf:archname"}) {
 796             $extract{"conf:archame+osvers"} = join " ", @extract{"conf:archname","conf:osvers"};
 797         }
 798         my $data = $dumpvars->{"==DATA=="} ||= [];
 799         push @$data, \%extract;
 800     }
 801     # ---- %extract finished ----
 802     my $diag = "";
 803     if (my $qr = $Opt{dumpvars}) {
 804         $qr = qr/$qr/;
 805         while (my($k,$v) = each %extract) {
 806             if ($k =~ $qr) {
 807                 $dumpvars->{$k}{$v}{$ok}++;
 808             }
 809         }
 810     }
 811     for my $want (@q) {
 812         my $have  = $extract{$want} || "";
 813         $diag .= " $want\[$have]";
 814     }
 815     printf STDERR " %-4s %8d%s\n", $ok, $id, $diag unless $Opt{quiet};
 816     if ($Opt{raw}) {
 817         $report =~ s/\s+\z//;
 818         print STDERR $report, "\n================\n" unless $Opt{quiet};
 819     }
 820     if ($Opt{interactive}) {
 821         require IO::Prompt;
 822         local @ARGV;
 823         local $ARGV;
 824         my $ans = IO::Prompt::prompt
 825             (
 826              -p => "View $id? [onechar: ynq] ",
 827              -d => "y",
 828              -u => qr/[ynq]/,
 829              -onechar,
 830             );
 831         print STDERR "\n" unless $Opt{quiet};
 832         if ($ans eq "y") {
 833             open my $ifh, "<", $target or die "Could not open $target: $!";
 834             $Opt{pager} ||= "less";
 835             open my $lfh, "|-", $Opt{pager} or die "Could not fork '$Opt{pager}': $!";
 836             local $/;
 837             print {$lfh} <$ifh>;
 838             close $ifh or die "Could not close $target: $!";
 839             close $lfh or die "Could not close pager: $!"
 840         } elsif ($ans eq "q") {
 841             $Signal++;
 842             return;
 843         }
 844     }
 845     return \%extract;
 846 }
 847
 848 =head2 solve
 849
 850 Feeds a couple of potentially interesting data to
 851 Statistics::Regression and sorts the result by R^2 descending. Do not
 852 confuse this with a prove, rather take it as a useful hint. It can
 853 save you minutes of staring at data and provide a quick overview where
 854 one should look closer. Displays the N top candidates, where N
 855 defaults to 3 and can be set with the C<$Opt{solvetop}> variable.
 856 Regressions results with an R^2 of 1.00 are displayed in any case.
 857
 858 The function is called when the option C<-solve> is give on the
 859 commandline. Several extra config variables are calculated, see source
 860 code for details.
 861
 862 =cut
 863 {
 864     my %never_solve_on = map {($_ => 1)}
 865         (
 866          "conf:ccflags",
 867          "conf:config_args",
 868          "conf:cppflags",
 869          "conf:lddlflags",
 870          "conf:uname",
 871          "env:PATH",
 872          "env:PERL5LIB",
 873          "env:PERL5OPT",
 874          'env:$^X',
 875          'env:$EGID',
 876          'env:$GID',
 877          'env:$UID/$EUID',
 878          'env:PERL5_CPANPLUS_IS_RUNNING',
 879          'env:PERL5_CPAN_IS_RUNNING',
 880          'env:PERL5_CPAN_IS_RUNNING_IN_RECURSION',
 881          'meta:ok',
 882         );
 883     my %normalize_numeric =
 884         (
 885          id => sub { return shift },
 886          'meta:date' => sub {
 887              my $v = shift;
 888              my($Y,$M,$D,$h,$m,$s) = $v =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)/;
 889              unless (defined $M) {
 890                  die "illegal value[$v] for a date";
 891              }
 892              Time::Local::timegm($s,$m,$h,$D,$M-1,$Y);
 893          },
 894         );
 895     my %normalize_value =
 896         (
 897          'meta:perl' => sub {
 898              my($perlatpatchlevel) = shift;
 899              my $perl = $perlatpatchlevel;
 900              $perl =~ s/\@.*//;
 901              $perl;
 902          },
 903         );
 904 sub solve {
 905     my($V,%Opt) = @_;
 906     require Statistics::Regression;
 907     my @regression;
 908     my $ycb;
 909     if (my $ycbbody = $Opt{ycb}) {
 910         $ycb = eval('sub {'.$ycbbody.'}');
 911         die if $@;
 912     } else {
 913         $ycb = sub {
 914             my $rec = shift;
 915             my $y;
 916             if ($rec->{"meta:ok"} eq "PASS") {
 917                 $y = 1;
 918             } elsif ($rec->{"meta:ok"} eq "FAIL") {
 919                 $y = 0;
 920             }
 921             return $y
 922         };
 923     }
 924   VAR: for my $variable (sort keys %$V) {
 925         next if $variable eq "==DATA==";
 926         if ($never_solve_on{$variable}){
 927             warn "Skipping '$variable'\n" unless $Opt{quiet};
 928             next VAR;
 929         }
 930         my $value_distribution = $V->{$variable};
 931         my $keys = keys %$value_distribution;
 932         my @X = qw(const);
 933         if ($normalize_numeric{$variable}) {
 934             push @X, "n_$variable";
 935         } else {
 936             my %seen = ();
 937             for my $value (sort keys %$value_distribution) {
 938                 my $pf = $value_distribution->{$value};
 939                 $pf->{PASS} ||= 0;
 940                 $pf->{FAIL} ||= 0;
 941                 if ($pf->{PASS} || $pf->{FAIL}) {
 942                     my $Xele = sprintf "eq_%s",
 943                         (
 944                          $normalize_value{$variable} ?
 945                          $normalize_value{$variable}->($value) :
 946                          $value
 947                         );
 948                     push @X, $Xele unless $seen{$Xele}++;
 949
 950                 }
 951                 if (
 952                     $pf->{PASS} xor $pf->{FAIL}
 953                    ) {
 954                     my $vl = 40;
 955                     substr($value,$vl) = "..." if length $value > 3+$vl;
 956                     my $poor_mans_freehand_estimation = 0;
 957                     if ($poor_mans_freehand_estimation) {
 958                         warn sprintf
 959                             (
 960                              "%4d %4d %-23s | %s\n",
 961                              $pf->{PASS},
 962                              $pf->{FAIL},
 963                              $variable,
 964                              $value,
 965                             );
 966                     }
 967                 }
 968             }
 969         }
 970         warn "variable[$variable]keys[$keys]X[@X]\n" unless $Opt{quiet};
 971         next VAR unless @X > 1;
 972         my %regdata =
 973             (
 974              X => \@X,
 975              data => [],
 976             );
 977       RECORD: for my $rec (@{$V->{"==DATA=="}}) {
 978             my $y = $ycb->($rec);
 979             next RECORD unless defined $y;
 980             my %obs;
 981             $obs{Y} = $y;
 982             @obs{@X} = (0) x @X;
 983             $obs{const} = 1;
 984             for my $x (@X) {
 985                 if ($x =~ /^eq_(.+)/) {
 986                     my $read_v = $1;
 987                     if (exists $rec->{$variable}
 988                         && defined $rec->{$variable}
 989                        ) {
 990                         my $use_v = (
 991                                      $normalize_value{$variable} ?
 992                                      $normalize_value{$variable}->($rec->{$variable}) :
 993                                      $rec->{$variable}
 994                                     );
 995                         if ($use_v eq $read_v) {
 996                             $obs{$x} = 1;
 997                         }
 998                     }
 999                     # warn "DEBUG: y[$y]x[$x]obs[$obs{$x}]\n";
1000                 } elsif ($x =~ /^n_(.+)/) {
1001                     my $v = $1;
1002                     $obs{$x} = eval { $normalize_numeric{$v}->($rec->{$v}); };
1003                     if ($@) {
1004                         warn "Warning: error during parsing v[$v] in record[$rec->{id}]: $@; continuing with undef value";
1005                     }
1006                 }
1007             }
1008             push @{$regdata{data}}, \%obs;
1009         }
1010         _run_regression ($variable, \%regdata, \@regression, \%Opt);
1011     }
1012     my $top = min ($Opt{solvetop} || 3, scalar @regression);
1013     my $max_rsq = sum map {1==$_->rsq ? 1 : 0} @regression;
1014     $top = $max_rsq if $max_rsq && $max_rsq > $top;
1015     my $score = 0;
1016     printf
1017         (
1018          "State after regression testing: %d results, showing top %d\n\n",
1019          scalar @regression,
1020          $top,
1021         );
1022     for my $reg (sort {
1023                      $b->rsq <=> $a->rsq
1024                      ||
1025                      $a->k <=> $b->k
1026                  } @regression) {
1027         printf "(%d)\n", ++$score;
1028         eval { $reg->print; };
1029         if ($@) {
1030             printf "\n\nOops, Statistics::Regression died during ->print() with error message[$@]\n\n";
1031         }
1032         last if --$top <= 0;
1033     }
1034 }
1035 }
1036
1037 # $variable is the name we pass through to S:R constructor
1038 # $regdata is hash and has the arrays "X" and "data" (observations)
1039 # X goes to S:R constructor
1040 # each observation has a Y which we pass to S:R in an include() call
1041 # $regression is the collector array of results
1042 # $opt are the options from outside, used to see if we are "verbose"
1043 sub _run_regression {
1044     my($variable,$regdata,$regression,$opt) = @_;
1045     my @X = @{$regdata->{X}};
1046     # my $splo = $regdata->{"spliced-out"} = []; # maybe can be used to
1047                                                # hold the reference
1048                                                # group
1049     while (@X > 1) {
1050         my $reg = Statistics::Regression->new($variable,\@X);
1051         for my $obs (@{$regdata->{data}}) {
1052             my $y = delete $obs->{Y};
1053             $reg->include($y, $obs);
1054             $obs->{Y} = $y;
1055         }
1056         eval {$reg->theta;
1057               my @e = $reg->standarderrors;
1058               die "found standarderrors == 0" if grep { 0 == $_ } @e;
1059               $reg->rsq;};
1060         if ($@) {
1061             if ($opt->{verbose} && $opt->{verbose}>=2) {
1062                 require YAML::Syck;
1063                 warn YAML::Syck::Dump
1064                     ({error=>"could not determine some regression parameters",
1065                       variable=>$variable,
1066                       k=>$reg->k,
1067                       n=>$reg->n,
1068                       X=>$regdata->{"X"},
1069                       errorstr => $@,
1070                      });
1071             }
1072             # reduce k in case that linear dependencies disturbed us;
1073             # often called reference group; I'm tempted to collect and
1074             # make visible
1075             splice @X, 1, 1;
1076         } else {
1077             # $reg->print;
1078             push @$regression, $reg;
1079             return;
1080         }
1081     }
1082 }
1083
1084 =head1 AUTHOR
1085
1086 Andreas König
1087
1088 =head1 BUGS
1089
1090 Please report any bugs or feature requests through the web
1091 interface at
1092 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=CPAN-Testers-ParseReport>.
1093 I will be notified, and then you'll automatically be notified of
1094 progress on your bug as I make changes.
1095
1096 =head1 SUPPORT
1097
1098 You can find documentation for this module with the perldoc command.
1099
1100     perldoc CPAN::Testers::ParseReport
1101
1102
1103 You can also look for information at:
1104
1105 =over 4
1106
1107 =item * RT: CPAN's request tracker
1108
1109 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=CPAN-Testers-ParseReport>
1110
1111 =item * AnnoCPAN: Annotated CPAN documentation
1112
1113 L<http://annocpan.org/dist/CPAN-Testers-ParseReport>
1114
1115 =item * CPAN Ratings
1116
1117 L<http://cpanratings.perl.org/d/CPAN-Testers-ParseReport>
1118
1119 =item * Search CPAN
1120
1121 L<http://search.cpan.org/dist/CPAN-Testers-ParseReport>
1122
1123 =back
1124
1125
1126 =head1 ACKNOWLEDGEMENTS
1127
1128 Thanks to RJBS for module-starter.
1129
1130 =head1 COPYRIGHT & LICENSE
1131
1132 Copyright 2008 Andreas König.
1133
1134 This program is free software; you can redistribute it and/or modify it
1135 under the same terms as Perl itself.
1136
1137
1138 =cut
1139
1140 1; # End of CPAN::Testers::ParseReport