lib/CPAN/Testers/ParseReport.pm

   1 package CPAN::Testers::ParseReport;
   2
   3 use warnings;
   4 use strict;
   5
   6 use Config::Perl::V ();
   7 use DateTime::Format::Strptime;
   8 use File::Basename qw(basename);
   9 use File::Path qw(mkpath);
  10 use HTML::Entities qw(decode_entities);
  11 use LWP::UserAgent;
  12 use List::Util qw(max min sum);
  13 use MIME::QuotedPrint ();
  14 use Net::NNTP ();
  15 use Time::Local ();
  16 use XML::LibXML;
  17 use XML::LibXML::XPathContext;
  18
  19 our $default_ctformat = "yaml";
  20 our $default_transport = "nntp";
  21 our $default_cturl = "http://www.cpantesters.org/show";
  22 our $Signal = 0;
  23
  24 =encoding utf-8
  25
  26 =head1 NAME
  27
  28 CPAN::Testers::ParseReport - parse reports to www.cpantesters.org from various sources
  29
  30 =cut
  31
  32 use version; our $VERSION = qv('0.1.10');
  33
  34 =head1 SYNOPSIS
  35
  36 The documentation in here is normally not needed because the code is
  37 meant to be run from a standalone program, L<ctgetreports>.
  38
  39   ctgetreports --q mod:Moose Devel-Events
  40
  41 =head1 DESCRIPTION
  42
  43 This is the core module for CPAN::Testers::ParseReport. If you're not
  44 looking to extend or alter the behaviour of this module, you probably
  45 want to look at L<ctgetreports> instead.
  46
  47 =head1 OPTIONS
  48
  49 Are described in the L<ctgetreports> manpage and are passed through to
  50 the functions unaltered.
  51
  52 =head1 FUNCTIONS
  53
  54 =head2 parse_distro($distro,%options)
  55
  56 reads the cpantesters HTML page or the YAML file or the local database
  57 for the distro and loops through the reports for the specified or most
  58 recent version of that distro found in these data.
  59
  60 parse_distro() intentionally has no meaningful return value, different
  61 options would require different ones.
  62
  63 =head2 $extract = parse_single_report($report,$dumpvars,%options)
  64
  65 mirrors and reads this report. $report is of the form
  66
  67   { id => number }
  68
  69 $dumpvar is a hashreference that gets filled with data.
  70
  71 $extract is the result of parse_report() described below.
  72
  73 =cut
  74
  75 {
  76     my $ua;
  77     sub _ua {
  78         return $ua if $ua;
  79         $ua = LWP::UserAgent->new
  80             (
  81              keep_alive => 1,
  82              env_proxy => 1,
  83             );
  84         $ua->parse_head(0);
  85         $ua;
  86     }
  87 }
  88
  89 {
  90     my $nntp;
  91     sub _nntp {
  92         return $nntp if $nntp;
  93         $nntp = Net::NNTP->new("nntp.perl.org");
  94         $nntp->group("perl.cpan.testers");
  95         return $nntp;
  96     }
  97 }
  98
  99 {
 100     my $xp;
 101     sub _xp {
 102         return $xp if $xp;
 103         $xp = XML::LibXML->new;
 104         $xp->keep_blanks(0);
 105         $xp->clean_namespaces(1);
 106         my $catalog = __FILE__;
 107         $catalog =~ s|ParseReport.pm$|ParseReport/catalog|;
 108         $xp->load_catalog($catalog);
 109         return $xp;
 110     }
 111 }
 112
 113 sub _download_overview {
 114     my($cts_dir, $distro, %Opt) = @_;
 115     my $format = $Opt{ctformat} ||= $default_ctformat;
 116     my $cturl = $Opt{cturl} ||= $default_cturl;
 117     my $ctarget = "$cts_dir/$distro.$format";
 118     my $cheaders = "$cts_dir/$distro.headers";
 119     if ($Opt{local}) {
 120         unless (-e $ctarget) {
 121             die "Alert: No local file '$ctarget' found, cannot continue\n";
 122         }
 123     } else {
 124         if (! -e $ctarget or -M $ctarget > .25) {
 125             if (-e $ctarget && $Opt{verbose}) {
 126                 my(@stat) = stat _;
 127                 my $timestamp = gmtime $stat[9];
 128                 print STDERR "(timestamp $timestamp GMT)\n" unless $Opt{quiet};
 129             }
 130             print STDERR "Fetching $ctarget..." if $Opt{verbose} && !$Opt{quiet};
 131             my $uri = "$cturl/$distro.$format";
 132             my $resp = _ua->mirror($uri,$ctarget);
 133             if ($resp->is_success) {
 134                 print STDERR "DONE\n" if $Opt{verbose} && !$Opt{quiet};
 135                 open my $fh, ">", $cheaders or die;
 136                 for ($resp->headers->as_string) {
 137                     print $fh $_;
 138                     if ($Opt{verbose} && $Opt{verbose}>1) {
 139                         print STDERR $_ unless $Opt{quiet};
 140                     }
 141                 }
 142             } elsif (304 == $resp->code) {
 143                 print STDERR "DONE (not modified)\n" if $Opt{verbose} && !$Opt{quiet};
 144                 my $atime = my $mtime = time;
 145                 utime $atime, $mtime, $cheaders;
 146             } else {
 147                 die sprintf
 148                     (
 149                      "No success downloading %s: %s",
 150                      $uri,
 151                      $resp->status_line,
 152                     );
 153             }
 154         }
 155     }
 156     return $ctarget;
 157 }
 158
 159 sub _parse_html {
 160     my($ctarget, %Opt) = @_;
 161     my $content = do { open my $fh, $ctarget or die; local $/; <$fh> };
 162     my $preprocesswithtreebuilder = 0; # not needed since barbie switched to XHTML
 163     if ($preprocesswithtreebuilder) {
 164         require HTML::TreeBuilder;
 165         my $tree = HTML::TreeBuilder->new;
 166         $tree->implicit_tags(1);
 167         $tree->p_strict(1);
 168         $tree->ignore_ignorable_whitespace(0);
 169         $tree->parse_content($content);
 170         $tree->eof;
 171         $content = $tree->as_XML;
 172     }
 173     my $parser = _xp();
 174     my $doc = eval { $parser->parse_string($content) };
 175     my $err = $@;
 176     unless ($doc) {
 177         my $distro = basename $ctarget;
 178         die sprintf "Error while parsing %s\: %s", $distro, $err;
 179     }
 180     my $xc = XML::LibXML::XPathContext->new($doc);
 181     my $nsu = $doc->documentElement->namespaceURI;
 182     $xc->registerNs('x', $nsu) if $nsu;
 183     my($selected_release_ul,$selected_release_distrov,$excuse_string);
 184     my($cparentdiv)
 185         = $nsu ?
 186             $xc->findnodes("/x:html/x:body/x:div[\@id = 'doc']") :
 187                 $doc->findnodes("/html/body/div[\@id = 'doc']");
 188     my(@releasedivs) = $nsu ?
 189         $xc->findnodes("//x:div[x:h2 and x:ul]",$cparentdiv) :
 190             $cparentdiv->findnodes("//div[h2 and ul]");
 191     my $releasediv;
 192     if ($Opt{vdistro}) {
 193         $excuse_string = "selected distro '$Opt{vdistro}'";
 194         my($fallbacktoversion) = $Opt{vdistro} =~ /(\d+\..*)/;
 195         $fallbacktoversion = 0 unless defined $fallbacktoversion;
 196       RELEASE: for my $i (0..$#releasedivs) {
 197             my $picked = "";
 198             my($x) = $nsu ?
 199                 $xc->findvalue("x:h2/x:a[2]/\@name",$releasedivs[$i]) :
 200                     $releasedivs[$i]->findvalue("h2/a[2]/\@name");
 201             if ($x) {
 202                 if ($x eq $Opt{vdistro}) {
 203                     $releasediv = $i;
 204                     $picked = " (picked)";
 205                 }
 206                 print STDERR "FOUND DISTRO: $x$picked\n" unless $Opt{quiet};
 207             } else {
 208                 ($x) = $nsu ?
 209                     $xc->findvalue("x:h2/x:a[1]/\@name",$releasedivs[$i]) :
 210                         $releasedivs[$i]->findvalue("h2/a[1]/\@name");
 211                 if ($x eq $fallbacktoversion) {
 212                     $releasediv = $i;
 213                     $picked = " (picked)";
 214                 }
 215                 print STDERR "FOUND VERSION: $x$picked\n" unless $Opt{quiet};
 216             }
 217         }
 218     } else {
 219         $excuse_string = "any distro";
 220     }
 221     unless (defined $releasediv) {
 222         $releasediv = 0;
 223     }
 224     # using a[1] because a[2] is missing on the first entry
 225     ($selected_release_distrov) = $nsu ?
 226         $xc->findvalue("x:h2/x:a[1]/\@name",$releasedivs[$releasediv]) :
 227             $releasedivs[$releasediv]->findvalue("h2/a[1]/\@name");
 228     ($selected_release_ul) = $nsu ?
 229         $xc->findnodes("x:ul",$releasedivs[$releasediv]) :
 230             $releasedivs[$releasediv]->findnodes("ul");
 231     unless (defined $selected_release_distrov) {
 232         warn "Warning: could not find $excuse_string in '$ctarget'";
 233         return;
 234     }
 235     print STDERR "SELECTED: $selected_release_distrov\n" unless $Opt{quiet};
 236     my($id);
 237     my @all;
 238     for my $test ($nsu ?
 239                   $xc->findnodes("x:li",$selected_release_ul) :
 240                   $selected_release_ul->findnodes("li")) {
 241         $id = $nsu ?
 242             $xc->findvalue("x:a[1]/text()",$test)     :
 243                 $test->findvalue("a[1]/text()");
 244         push @all, {id=>$id};
 245         return if $Signal;
 246     }
 247     return \@all;
 248 }
 249
 250 sub _parse_yaml {
 251     my($ctarget, %Opt) = @_;
 252     require YAML::Syck;
 253     my $arr = YAML::Syck::LoadFile($ctarget);
 254     my($selected_release_ul,$selected_release_distrov,$excuse_string);
 255     if ($Opt{vdistro}) {
 256         $excuse_string = "selected distro '$Opt{vdistro}'";
 257         $arr = [grep {$_->{distversion} eq $Opt{vdistro}} @$arr];
 258         ($selected_release_distrov) = $arr->[0]{distversion};
 259     } else {
 260         $excuse_string = "any distro";
 261         my $last_addition;
 262         my %seen;
 263         for my $report (sort { $a->{id} <=> $b->{id} } @$arr) {
 264             unless ($seen{$report->{distversion}}++) {
 265                 $last_addition = $report->{distversion};
 266             }
 267         }
 268         $arr = [grep {$_->{distversion} eq $last_addition} @$arr];
 269         ($selected_release_distrov) = $last_addition;
 270     }
 271     unless ($selected_release_distrov) {
 272         warn "Warning: could not find $excuse_string in '$ctarget'";
 273         return;
 274     }
 275     print STDERR "SELECTED: $selected_release_distrov\n" unless $Opt{quiet};
 276     my @all;
 277     for my $test (@$arr) {
 278         my $id = $test->{id};
 279         push @all, {id=>$id};
 280         return if $Signal;
 281     }
 282     @all = sort { $b->{id} <=> $a->{id} } @all;
 283     return \@all;
 284 }
 285
 286 sub parse_single_report {
 287     my($report, $dumpvars, %Opt) = @_;
 288     my($id) = $report->{id};
 289     $Opt{cachedir} ||= "$ENV{HOME}/var/cpantesters";
 290     my $nnt_dir = "$Opt{cachedir}/nntp-testers";
 291     mkpath $nnt_dir;
 292     my $target = "$nnt_dir/$id";
 293     if ($Opt{local}) {
 294         unless (-e $target) {
 295             die {severity=>0,text=>"Warning: No local file '$target' found, skipping\n"};
 296         }
 297     } else {
 298         if (! -e $target) {
 299             print STDERR "Fetching $target..." if $Opt{verbose} && !$Opt{quiet};
 300             $Opt{transport} ||= $default_transport;
 301             if ($Opt{transport} eq "nntp") {
 302                 my $article = _nntp->article($id);
 303                 unless ($article) {
 304                     die {severity=>0,text=>"NNTP-Server did not return an article for id[$id]"};
 305                 }
 306                 open my $fh, ">", $target or die {severity=>1,text=>"Could not open >$target: $!"};
 307                 print $fh @$article;
 308             } elsif ($Opt{transport} eq "http") {
 309                 my $resp = _ua->mirror("http://www.nntp.perl.org/group/perl.cpan.testers/$id",$target);
 310                 if ($resp->is_success) {
 311                     if ($Opt{verbose}) {
 312                         my(@stat) = stat $target;
 313                         my $timestamp = gmtime $stat[9];
 314                         print STDERR "(timestamp $timestamp GMT)\n" unless $Opt{quiet};
 315                         if ($Opt{verbose} > 1) {
 316                             print STDERR $resp->headers->as_string unless $Opt{quiet};
 317                         }
 318                     }
 319                     my $headers = "$target.headers";
 320                     open my $fh, ">", $headers or die {severity=>1,text=>"Could not open >$headers: $!"};
 321                     print $fh $resp->headers->as_string;
 322                 } else {
 323                     die {severity=>0,
 324                              text=>sprintf "HTTP Server Error[%s] for id[%s]", $resp->status_line, $id};
 325                 }
 326             } else {
 327                 die {severity=>1,text=>"Illegal value for --transport: '$Opt{transport}'"};
 328             }
 329         }
 330     }
 331     parse_report($target, $dumpvars, %Opt);
 332 }
 333
 334 sub parse_distro {
 335     my($distro,%Opt) = @_;
 336     my %dumpvars;
 337     $Opt{cachedir} ||= "$ENV{HOME}/var/cpantesters";
 338     my $cts_dir = "$Opt{cachedir}/cpantesters-show";
 339     mkpath $cts_dir;
 340     if ($Opt{solve}) {
 341         require Statistics::Regression;
 342         $Opt{dumpvars} = "." unless defined $Opt{dumpvars};
 343     }
 344     if (!$Opt{vdistro} && $distro =~ /^(.+)-(?i:v?\d+)(?:\.\d+)*\w*$/) {
 345         $Opt{vdistro} = $distro;
 346         $distro = $1;
 347     }
 348     my $reports;
 349     if (my $ctdb = $Opt{ctdb}) {
 350         require CPAN::WWW::Testers::Generator::Database;
 351         require CPAN::DistnameInfo;
 352         my $dbi = CPAN::WWW::Testers::Generator::Database->new(database=>$ctdb) or die "Alert: unknown error while opening database '$ctdb'";
 353         unless ($Opt{vdistro}) {
 354             my $sql = "select version from cpanstats where dist=? order by id";
 355             my @rows = $dbi->get_query($sql,$distro);
 356             my($newest,%seen);
 357             for my $row (@rows) {
 358                 $newest = $row->[0] unless $seen{$row->[0]}++;
 359             }
 360             $Opt{vdistro} = "$distro-$newest";
 361         }
 362         my $d = CPAN::DistnameInfo->new("FOO/$Opt{vdistro}.tgz");
 363         my $dist = $d->dist;
 364         my $version = $d->version;
 365         my $sql = "select id from cpanstats where dist=? and version=? order by id desc";
 366         my @rows = $dbi->get_query($sql,$dist,$version);
 367         my @all;
 368         for my $row (@rows) {
 369             my $id = $row->[0];
 370             push @all, {id=>$id};
 371         }
 372         $reports = \@all;
 373     } else {
 374         my $ctarget = _download_overview($cts_dir, $distro, %Opt);
 375         $Opt{ctformat} ||= $default_ctformat;
 376         if ($Opt{ctformat} eq "html") {
 377             $reports = _parse_html($ctarget,%Opt);
 378         } else {
 379             $reports = _parse_yaml($ctarget,%Opt);
 380         }
 381     }
 382     return unless $reports;
 383     for my $report (@$reports) {
 384         eval {parse_single_report($report, \%dumpvars, %Opt)};
 385         if ($@) {
 386             if (ref $@) {
 387                 if ($@->{severity}) {
 388                     die $@->{text};
 389                 } else {
 390                     warn $@->{text};
 391                 }
 392             } else {
 393                 die $@;
 394             }
 395         }
 396         last if $Signal;
 397     }
 398     if ($Opt{dumpvars}) {
 399         require YAML::Syck;
 400         my $dumpfile = $Opt{dumpfile} || "ctgetreports.out";
 401         open my $fh, ">", $dumpfile or die "Could not open '$dumpfile' for writing: $!";
 402         print $fh YAML::Syck::Dump(\%dumpvars);
 403         close $fh or die "Could not close '$dumpfile': $!"
 404     }
 405     if ($Opt{solve}) {
 406         solve(\%dumpvars,%Opt);
 407     }
 408 }
 409
 410 =head2 $bool = _looks_like_qp($raw_report)
 411
 412 We had to acknowledge the fact that some MTAs swallow the MIME-Version
 413 header while passing MIME through. So we introduce fallback heuristics
 414 that try to determine if a report is written in quoted printable.
 415
 416 Note that this subroutine is internal, just documented to have the
 417 internals documented.
 418
 419 The current implementation counts the number of QP escaped spaces and
 420 equal signs.
 421
 422 =cut
 423
 424 sub _looks_like_qp {
 425     my($report) = @_;
 426     my $count_space = () = $report =~ /=20/g;
 427     return 1 if $count_space > 12;
 428     my $count_equal = () = $report =~ /=3D/g;
 429     return 1 if $count_equal > 12;
 430     return 1 if $count_space+$count_equal > 24;
 431     return 0; # waiting for a counter example
 432 }
 433
 434 =head2 $extract = parse_report($target,$dumpvars,%Opt)
 435
 436 Reads one report. $target is the local filename to read. $dumpvars is
 437 a hashref which gets filled with descriptive stats about
 438 PASS/FAIL/etc. %Opt are the options as described in the
 439 C<ctgetreports> manpage. $extract is a hashref containing the found
 440 variables.
 441
 442 Note: this parsing is a bit dirty but as it seems good enough I'm not
 443 inclined to change it. We parse HTML with regexps only, not an HTML
 444 parser. Only the entities are decoded.
 445
 446 Update around version 0.0.17: switching to nntp now but keeping the
 447 parser for HTML around to read old local copies.
 448
 449 Update around 0.0.18: In %Options you can use
 450
 451     article => $some_full_article_as_scalar
 452
 453 to use this function to parse one full article as text. When this is
 454 given, the argument $target is not read, but its basename is taken to
 455 be the id of the article. (OMG, hackers!)
 456
 457 =cut
 458 sub parse_report {
 459     my($target,$dumpvars,%Opt) = @_;
 460     our @q;
 461     my $id = basename($target);
 462     # warn "DEBUG: id[$id]";
 463     my($ok,$about);
 464
 465     my(%extract);
 466
 467     my($report,$isHTML) = _get_cooked_report($target, \%Opt);
 468     my @qr = map /^qr:(.+)/, @{$Opt{q}};
 469     if ($Opt{raw} || @qr) {
 470         for my $qr (@qr) {
 471             my $cqr = eval "qr{$qr}";
 472             die "Could not compile regular expression '$qr': $@" if $@;
 473             my(@matches) = $report =~ $cqr;
 474             my $v;
 475             if (@matches) {
 476                 if (@matches==1) {
 477                     $v = $matches[0];
 478                 } else {
 479                     $v = join "", map {"($_)"} @matches;
 480                 }
 481             } else {
 482                 $v = "";
 483             }
 484             $extract{"qr:$qr"} = $v;
 485         }
 486     }
 487
 488     my $report_writer;
 489     my $moduleunpack = {};
 490     my $expect_prereq = 0;
 491     my $expect_toolchain = 0;
 492     my $expecting_toolchain_soon = 0;
 493     my $fallback_p5 = "";
 494
 495     my $in_summary = 0;
 496     my $in_summary_seen_platform = 0;
 497     my $in_prg_output = 0;
 498     my $in_env_context = 0;
 499
 500     my $current_headline;
 501     my @previous_line = ""; # so we can neutralize line breaks
 502     my @rlines = split /\r?\n/, $report;
 503  LINE: for (@rlines) {
 504         next LINE unless ($isHTML ? m/<title>(\S+)\s+(\S+)/ : m/^Subject:\s*(\S+)\s+(\S+)/);
 505         $ok = $1;
 506         $about = $2;
 507         $extract{"meta:ok"}    = $ok;
 508         $extract{"meta:about"} = $about;
 509         last;
 510     }
 511  LINE: while (@rlines) {
 512         $_ = shift @rlines;
 513         while (/!$/ and @rlines) {
 514             my $followupline = shift @rlines;
 515             $followupline =~ s/^\s+//; # remo leading space
 516             $_ .= $followupline;
 517         }
 518         if (/^--------/ && $previous_line[-2] && $previous_line[-2] =~ /^--------/) {
 519             $current_headline = $previous_line[-1];
 520             if ($current_headline =~ /PROGRAM OUTPUT/) {
 521                 $in_prg_output = 1;
 522             } else {
 523                 $in_prg_output = 0;
 524             }
 525             if ($current_headline =~ /ENVIRONMENT AND OTHER CONTEXT/) {
 526                 $in_env_context = 1;
 527             } else {
 528                 $in_env_context = 0;
 529             }
 530         }
 531         if ($extract{"meta:perl"}) {
 532             if (    $in_summary
 533                 and !$extract{"conf:git_commit_id"}
 534                 and /Commit id:\s*([[:xdigit:]]+)/) {
 535                 $extract{"conf:git_commit_id"} = $1;
 536             }
 537         } else {
 538             my $p5;
 539             if (0) {
 540             } elsif (/Summary of my perl5 \((.+)\) configuration:/) {
 541                 $p5 = $1;
 542                 $in_summary = 1;
 543                 $in_env_context = 0;
 544             }
 545             if ($p5) {
 546                 my($r,$v,$s,$p);
 547                 if (($r,$v,$s,$p) = $p5 =~ /revision (\S+) version (\S+) subversion (\S+) patch (\S+)/) {
 548                     $r =~ s/\.0//; # 5.0 6 2!
 549                     $extract{"meta:perl"} = "$r.$v.$s\@$p";
 550                 } elsif (($r,$v,$s) = $p5 =~ /revision (\S+) version (\S+) subversion (\S+)/) {
 551                     $r =~ s/\.0//;
 552                     $extract{"meta:perl"} = "$r.$v.$s";
 553                 } elsif (($r,$v,$s) = $p5 =~ /(\d+\S*) patchlevel (\S+) subversion (\S+)/) {
 554                     $r =~ s/\.0//;
 555                     $extract{"meta:perl"} = "$r.$v.$s";
 556                 } else {
 557                     $extract{"meta:perl"} = $p5;
 558                 }
 559             }
 560         }
 561         unless ($extract{"meta:from"}) {
 562             if (0) {
 563             } elsif ($isHTML ?
 564                      m|<div class="h_name">From:</div> <b>(.+?)</b><br/>| :
 565                      m|^From:\s*(.+)|
 566                     ) {
 567                 $extract{"meta:from"} = $1;
 568             }
 569             $extract{"meta:from"} =~ s/\.$// if $extract{"meta:from"};
 570         }
 571         unless ($extract{"meta:date"}) {
 572             if (0) {
 573             } elsif ($isHTML ?
 574                      m|<div class="h_name">Date:</div> (.+?)<br/>| :
 575                      m|^Date:\s*(.+)|
 576                     ) {
 577                 my $date = $1;
 578                 my($dt);
 579             DATEFMT: for my $pat ("%a, %d %b %Y %T %z", # Sun, 28 Sep 2008 12:23:12 +0100
 580                                   "%b %d, %Y %R", # July 10,...
 581                                   "%b  %d, %Y %R", # July  4,...
 582                                  ) {
 583                     $dt = eval {
 584                         my $p = DateTime::Format::Strptime->new
 585                             (
 586                              locale => "en",
 587                              time_zone => "UTC",
 588                              pattern => $pat,
 589                             );
 590                         $p->parse_datetime($date)
 591                     };
 592                     last DATEFMT if $dt;
 593                 }
 594                 unless ($dt) {
 595                     warn "Could not parse date[$date], setting to epoch 0";
 596                     $dt = DateTime->from_epoch( epoch => 0 );
 597                 }
 598                 $extract{"meta:date"} = $dt->datetime;
 599             }
 600             $extract{"meta:date"} =~ s/\.$// if $extract{"meta:date"};
 601         }
 602         unless ($extract{"meta:writer"}) {
 603             for ("$previous_line[-1] $_") {
 604                 if (0) {
 605                 } elsif (/CPANPLUS, version (\S+)/) {
 606                     $extract{"meta:writer"} = "CPANPLUS $1";
 607                 } elsif (/created (?:automatically )?by (\S+)/) {
 608                     $extract{"meta:writer"} = $1;
 609                     if (/\s+on\s+perl\s+(\S+),/) {
 610                         $fallback_p5 = $1;
 611                     }
 612                 } elsif (/This report was machine-generated by (\S+) (\S+)/) {
 613                     $extract{"meta:writer"} = "$1 $2";
 614                 }
 615                 $extract{"meta:writer"} =~ s/[\.,]$// if $extract{"meta:writer"};
 616             }
 617         }
 618         if ($in_summary) {
 619             # we do that first three lines a bit too often
 620             my $qr = $Opt{dumpvars} || "";
 621             $qr = qr/$qr/ if $qr;
 622             unless (@q) {
 623                 @q = @{$Opt{q}||[]};
 624                 @q = qw(meta:perl conf:archname conf:usethreads conf:optimize meta:writer meta:from) unless @q;
 625             }
 626
 627             my %conf_vars = map {($_ => 1)} grep { /^conf:/ } @q;
 628
 629             if (/^\s+Platform:$/) {
 630                 $in_summary_seen_platform=1;
 631             } elsif (/^\s*$/ || m|</pre>|) {
 632                 # if not html, we have reached the end now
 633                 if ($in_summary_seen_platform) {
 634                     # some perls have an empty line after the summary line
 635                     $in_summary = 0;
 636                 }
 637             } else {
 638                 my(%kv) = /\G,?\s*([^=]+)=('[^']+?'|\S+)/gc;
 639                 while (my($k,$v) = each %kv) {
 640                     my $ck = "conf:$k";
 641                     $ck =~ s/\s+$//;
 642                     $v =~ s/,$//;
 643                     if ($v =~ /^'(.*)'$/) {
 644                         $v = $1;
 645                     }
 646                     $v =~ s/^\s+//;
 647                     $v =~ s/\s+$//;
 648                     if ($qr && $ck =~ $qr) {
 649                         $extract{$ck} = $v;
 650                     } elsif ($conf_vars{$ck}) {
 651                         $extract{$ck} = $v;
 652                     }
 653                 }
 654             }
 655         }
 656         if ($in_prg_output) {
 657             unless ($extract{"meta:output_from"}) {
 658                 if (/Output from (.+):$/) {
 659                     $extract{"meta:output_from"} = $1
 660                 }
 661             }
 662         }
 663         if ($in_env_context) {
 664             if (/^\s{4}(\S+)\s*=\s*(.*)$/) {
 665                 $extract{"env:$1"} = $2;
 666             }
 667         }
 668         push @previous_line, $_;
 669         if ($expect_prereq || $expect_toolchain) {
 670             if (/Perl module toolchain versions installed/) {
 671                 # first time discovered in CPANPLUS 0.89_06
 672                 $expecting_toolchain_soon = 1;
 673                 $expect_prereq=0;
 674                 next LINE;
 675             }
 676             if (exists $moduleunpack->{type}) {
 677                 my($module,$v,$needwant);
 678                 # type 1 and 2 are about prereqs, type three about toolchain
 679                 if ($moduleunpack->{type} == 1) {
 680                     (my $leader,$module,$needwant,$v) = eval { unpack $moduleunpack->{tpl}, $_; };
 681                     next LINE if $@;
 682                     if ($leader =~ /^-/) {
 683                         $moduleunpack = {};
 684                         $expect_prereq = 0;
 685                         next LINE;
 686                     } elsif ($leader =~ /^(
 687                                          buil          # build_requires:
 688                                         )/x) {
 689                         next LINE;
 690                     } elsif ($module =~ /^(
 691                                          -             # line drawing
 692                                         )/x) {
 693                         next LINE;
 694                     }
 695                 } elsif ($moduleunpack->{type} == 2) {
 696                     (my $leader,$module,$v,$needwant) = eval { unpack $moduleunpack->{tpl}, $_; };
 697                     next LINE if $@;
 698                     if ($leader =~ /^\*/) {
 699                         $moduleunpack = {};
 700                         $expect_prereq = 0;
 701                         next LINE;
 702                     } elsif (!defined $v
 703                              or !defined $needwant
 704                              or $v =~ /\s/
 705                              or $needwant =~ /\s/
 706                             ) {
 707                         ($module,$v,$needwant) = split " ", $_;
 708                     }
 709                 } elsif ($moduleunpack->{type} == 3) {
 710                     (my $leader,$module,$v) = eval { unpack $moduleunpack->{tpl}, $_; };
 711                     next LINE if $@;
 712                     if (!$module) {
 713                         $moduleunpack = {};
 714                         $expect_toolchain = 0;
 715                         next LINE;
 716                     } elsif ($module =~ /^-/) {
 717                         next LINE;
 718                     }
 719                 }
 720                 $module =~ s/\s+$//;
 721                 if ($module) {
 722                     $v =~ s/^\s+//;
 723                     $v =~ s/\s+$//;
 724                     my($modulename,$versionlead) = split " ", $module;
 725                     if (defined $modulename and defined $versionlead) {
 726                         $module = $modulename;
 727                         $v = "$versionlead$v";
 728                     }
 729                     if ($v eq "Have") {
 730                         next LINE;
 731                     }
 732                     $extract{"mod:$module"} = $v;
 733                     if ($needwant) {
 734                         $needwant =~ s/^\s+//;
 735                         $needwant =~ s/\s+$//;
 736                         $extract{"prereq:$module"} = $needwant;
 737                     }
 738                 }
 739             }
 740             if (/(\s+)(Module\s+)(Need\s+)Have/) {
 741                 $in_env_context = 0;
 742                 $moduleunpack = {
 743                                  tpl => 'a'.length($1).'a'.length($2).'a'.length($3).'a*',
 744                                  type => 1,
 745                                 };
 746             } elsif (/(\s+)(Module Name\s+)(Have)(\s+)Want/) {
 747                 $in_env_context = 0;
 748                 my $adjust_1 = 0;
 749                 my $adjust_2 = -length($4);
 750                 my $adjust_3 = length($4);
 751                 # I think they do not really try to align, usually we
 752                 # get away with split
 753                 $moduleunpack = {
 754                                  tpl => 'a'.length($1).'a'.(length($2)+$adjust_2).'a'.(length($3)+$adjust_3).'a*',
 755                                  type => 2,
 756                                 };
 757             }
 758         }
 759         if (/PREREQUISITES|Prerequisite modules loaded/) {
 760             $in_env_context = 0;
 761             $expect_prereq=1;
 762         }
 763         if ($expecting_toolchain_soon) {
 764             if (/(\s+)(Module(?:\sName)?\s+) Have/) {
 765                 $in_env_context = 0;
 766                 $expect_toolchain=1;
 767                 $expecting_toolchain_soon=0;
 768                 $moduleunpack = {
 769                                  tpl => 'a'.length($1).'a'.length($2).'a*',
 770                                  type => 3,
 771                                 };
 772             }
 773         }
 774         if (/toolchain versions installed/) {
 775             $in_env_context = 0;
 776             $expecting_toolchain_soon=1;
 777         }
 778     }                           # LINE
 779     if (! $extract{"meta:perl"} && $fallback_p5) {
 780         $extract{"meta:perl"} = $fallback_p5;
 781     }
 782     if ($Opt{solve}) {
 783         $extract{id} = $id;
 784         if ($extract{"conf:osvers"} && $extract{"conf:archname"}) {
 785             $extract{"conf:archname+osvers"} = join " ", @extract{"conf:archname","conf:osvers"};
 786         }
 787         my $data = $dumpvars->{"==DATA=="} ||= [];
 788         push @$data, \%extract;
 789     }
 790     # ---- %extract finished ----
 791     my $diag = "";
 792     if (my $qr = $Opt{dumpvars}) {
 793         $qr = qr/$qr/;
 794         while (my($k,$v) = each %extract) {
 795             if ($k =~ $qr) {
 796                 $dumpvars->{$k}{$v}{$ok}++;
 797             }
 798         }
 799     }
 800     for my $want (@q) {
 801         my $have  = $extract{$want} || "";
 802         $diag .= " $want\[$have]";
 803     }
 804     printf STDERR " %-4s %8d%s\n", $ok, $id, $diag unless $Opt{quiet};
 805     if ($Opt{raw}) {
 806         $report =~ s/\s+\z//;
 807         print STDERR $report, "\n================\n" unless $Opt{quiet};
 808     }
 809     if ($Opt{interactive}) {
 810         require IO::Prompt;
 811         local @ARGV;
 812         local $ARGV;
 813         my $ans = IO::Prompt::prompt
 814             (
 815              -p => "View $id? [onechar: ynq] ",
 816              -d => "y",
 817              -u => qr/[ynq]/,
 818              -onechar,
 819             );
 820         print STDERR "\n" unless $Opt{quiet};
 821         if ($ans eq "y") {
 822             my($report) = _get_cooked_report($target, \%Opt);
 823             $Opt{pager} ||= "less";
 824             open my $lfh, "|-", $Opt{pager} or die "Could not fork '$Opt{pager}': $!";
 825             local $/;
 826             print {$lfh} $report;
 827             close $lfh or die "Could not close pager: $!"
 828         } elsif ($ans eq "q") {
 829             $Signal++;
 830             return;
 831         }
 832     }
 833     return \%extract;
 834 }
 835
 836 sub _get_cooked_report {
 837     my($target, $Opt_ref) = @_;
 838     my($report, $isHTML);
 839     if ($report = $Opt_ref->{article}) {
 840         $isHTML = $report =~ /^</;
 841         undef $target;
 842     }
 843     if ($target) {
 844         open my $fh, $target or die "Could not open '$target': $!";
 845         local $/;
 846         my $raw_report = <$fh>;
 847         $isHTML = $raw_report =~ /^</;
 848         if ($isHTML) {
 849             $report = decode_entities($raw_report);
 850         } elsif ($raw_report =~ /^MIME-Version: 1.0$/m
 851                  ||
 852                  _looks_like_qp($raw_report)
 853                 ) {
 854             # minimizing MIME effort; don't know about reports in other formats
 855             $report = MIME::QuotedPrint::decode_qp($raw_report);
 856         } else {
 857             $report = $raw_report;
 858         }
 859         close $fh;
 860     }
 861     if ($report =~ /\r\n/) {
 862         my @rlines = split /\r?\n/, $report;
 863         $report = join "\n", @rlines;
 864     }
 865     ($report, $isHTML);
 866 }
 867
 868 =head2 solve
 869
 870 Feeds a couple of potentially interesting data to
 871 Statistics::Regression and sorts the result by R^2 descending. Do not
 872 confuse this with a prove, rather take it as a useful hint. It can
 873 save you minutes of staring at data and provide a quick overview where
 874 one should look closer. Displays the N top candidates, where N
 875 defaults to 3 and can be set with the C<$Opt{solvetop}> variable.
 876 Regressions results with an R^2 of 1.00 are displayed in any case.
 877
 878 The function is called when the option C<-solve> is give on the
 879 commandline. Several extra config variables are calculated, see source
 880 code for details.
 881
 882 =cut
 883 {
 884     my %never_solve_on = map {($_ => 1)}
 885         (
 886          "conf:ccflags",
 887          "conf:config_args",
 888          "conf:cppflags",
 889          "conf:lddlflags",
 890          "conf:uname",
 891          "env:PATH",
 892          "env:PERL5LIB",
 893          "env:PERL5OPT",
 894          'env:$^X',
 895          'env:$EGID',
 896          'env:$GID',
 897          'env:$UID/$EUID',
 898          'env:PERL5_CPANPLUS_IS_RUNNING',
 899          'env:PERL5_CPAN_IS_RUNNING',
 900          'env:PERL5_CPAN_IS_RUNNING_IN_RECURSION',
 901          'meta:ok',
 902         );
 903     my %normalize_numeric =
 904         (
 905          id => sub { return shift },
 906          'meta:date' => sub {
 907              my $v = shift;
 908              my($Y,$M,$D,$h,$m,$s) = $v =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)/;
 909              unless (defined $M) {
 910                  die "illegal value[$v] for a date";
 911              }
 912              Time::Local::timegm($s,$m,$h,$D,$M-1,$Y);
 913          },
 914         );
 915     my %normalize_value =
 916         (
 917          'meta:perl' => sub {
 918              my($perlatpatchlevel) = shift;
 919              my $perl = $perlatpatchlevel;
 920              $perl =~ s/\@.*//;
 921              $perl;
 922          },
 923         );
 924 sub solve {
 925     my($V,%Opt) = @_;
 926     require Statistics::Regression;
 927     my @regression;
 928     my $ycb;
 929     if (my $ycbbody = $Opt{ycb}) {
 930         $ycb = eval('sub {'.$ycbbody.'}');
 931         die if $@;
 932     } else {
 933         $ycb = sub {
 934             my $rec = shift;
 935             my $y;
 936             if ($rec->{"meta:ok"} eq "PASS") {
 937                 $y = 1;
 938             } elsif ($rec->{"meta:ok"} eq "FAIL") {
 939                 $y = 0;
 940             }
 941             return $y
 942         };
 943     }
 944   VAR: for my $variable (sort keys %$V) {
 945         next if $variable eq "==DATA==";
 946         if ($never_solve_on{$variable}){
 947             warn "Skipping '$variable'\n" unless $Opt{quiet};
 948             next VAR;
 949         }
 950         my $value_distribution = $V->{$variable};
 951         my $keys = keys %$value_distribution;
 952         my @X = qw(const);
 953         if ($normalize_numeric{$variable}) {
 954             push @X, "n_$variable";
 955         } else {
 956             my %seen = ();
 957             for my $value (sort keys %$value_distribution) {
 958                 my $pf = $value_distribution->{$value};
 959                 $pf->{PASS} ||= 0;
 960                 $pf->{FAIL} ||= 0;
 961                 if ($pf->{PASS} || $pf->{FAIL}) {
 962                     my $Xele = sprintf "eq_%s",
 963                         (
 964                          $normalize_value{$variable} ?
 965                          $normalize_value{$variable}->($value) :
 966                          $value
 967                         );
 968                     push @X, $Xele unless $seen{$Xele}++;
 969
 970                 }
 971                 if (
 972                     $pf->{PASS} xor $pf->{FAIL}
 973                    ) {
 974                     my $vl = 40;
 975                     substr($value,$vl) = "..." if length $value > 3+$vl;
 976                     my $poor_mans_freehand_estimation = 0;
 977                     if ($poor_mans_freehand_estimation) {
 978                         warn sprintf
 979                             (
 980                              "%4d %4d %-23s | %s\n",
 981                              $pf->{PASS},
 982                              $pf->{FAIL},
 983                              $variable,
 984                              $value,
 985                             );
 986                     }
 987                 }
 988             }
 989         }
 990         warn "variable[$variable]keys[$keys]X[@X]\n" unless $Opt{quiet};
 991         next VAR unless @X > 1;
 992         my %regdata =
 993             (
 994              X => \@X,
 995              data => [],
 996             );
 997       RECORD: for my $rec (@{$V->{"==DATA=="}}) {
 998             my $y = $ycb->($rec);
 999             next RECORD unless defined $y;
1000             my %obs;
1001             $obs{Y} = $y;
1002             @obs{@X} = (0) x @X;
1003             $obs{const} = 1;
1004             for my $x (@X) {
1005                 if ($x =~ /^eq_(.+)/) {
1006                     my $read_v = $1;
1007                     if (exists $rec->{$variable}
1008                         && defined $rec->{$variable}
1009                        ) {
1010                         my $use_v = (
1011                                      $normalize_value{$variable} ?
1012                                      $normalize_value{$variable}->($rec->{$variable}) :
1013                                      $rec->{$variable}
1014                                     );
1015                         if ($use_v eq $read_v) {
1016                             $obs{$x} = 1;
1017                         }
1018                     }
1019                     # warn "DEBUG: y[$y]x[$x]obs[$obs{$x}]\n";
1020                 } elsif ($x =~ /^n_(.+)/) {
1021                     my $v = $1;
1022                     $obs{$x} = eval { $normalize_numeric{$v}->($rec->{$v}); };
1023                     if ($@) {
1024                         warn "Warning: error during parsing v[$v] in record[$rec->{id}]: $@; continuing with undef value";
1025                     }
1026                 }
1027             }
1028             push @{$regdata{data}}, \%obs;
1029         }
1030         _run_regression ($variable, \%regdata, \@regression, \%Opt);
1031     }
1032     my $top = min ($Opt{solvetop} || 3, scalar @regression);
1033     my $max_rsq = sum map {1==$_->rsq ? 1 : 0} @regression;
1034     $top = $max_rsq if $max_rsq && $max_rsq > $top;
1035     my $score = 0;
1036     printf
1037         (
1038          "State after regression testing: %d results, showing top %d\n\n",
1039          scalar @regression,
1040          $top,
1041         );
1042     for my $reg (sort {
1043                      $b->rsq <=> $a->rsq
1044                      ||
1045                      $a->k <=> $b->k
1046                  } @regression) {
1047         printf "(%d)\n", ++$score;
1048         eval { $reg->print; };
1049         if ($@) {
1050             printf "\n\nOops, Statistics::Regression died during ->print() with error message[$@]\n\n";
1051         }
1052         last if --$top <= 0;
1053     }
1054 }
1055 }
1056
1057 # $variable is the name we pass through to S:R constructor
1058 # $regdata is hash and has the arrays "X" and "data" (observations)
1059 # X goes to S:R constructor
1060 # each observation has a Y which we pass to S:R in an include() call
1061 # $regression is the collector array of results
1062 # $opt are the options from outside, used to see if we are "verbose"
1063 sub _run_regression {
1064     my($variable,$regdata,$regression,$opt) = @_;
1065     my @X = @{$regdata->{X}};
1066     # my $splo = $regdata->{"spliced-out"} = []; # maybe can be used to
1067                                                # hold the reference
1068                                                # group
1069     while (@X > 1) {
1070         my $reg = Statistics::Regression->new($variable,\@X);
1071         for my $obs (@{$regdata->{data}}) {
1072             my $y = delete $obs->{Y};
1073             $reg->include($y, $obs);
1074             $obs->{Y} = $y;
1075         }
1076         eval {$reg->theta;
1077               my @e = $reg->standarderrors;
1078               die "found standarderrors == 0" if grep { 0 == $_ } @e;
1079               $reg->rsq;};
1080         if ($@) {
1081             if ($opt->{verbose} && $opt->{verbose}>=2) {
1082                 require YAML::Syck;
1083                 warn YAML::Syck::Dump
1084                     ({error=>"could not determine some regression parameters",
1085                       variable=>$variable,
1086                       k=>$reg->k,
1087                       n=>$reg->n,
1088                       X=>$regdata->{"X"},
1089                       errorstr => $@,
1090                      });
1091             }
1092             # reduce k in case that linear dependencies disturbed us;
1093             # often called reference group; I'm tempted to collect and
1094             # make visible
1095             splice @X, 1, 1;
1096         } else {
1097             # $reg->print;
1098             push @$regression, $reg;
1099             return;
1100         }
1101     }
1102 }
1103
1104 =head1 AUTHOR
1105
1106 Andreas König
1107
1108 =head1 BUGS
1109
1110 Please report any bugs or feature requests through the web
1111 interface at
1112 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=CPAN-Testers-ParseReport>.
1113 I will be notified, and then you'll automatically be notified of
1114 progress on your bug as I make changes.
1115
1116 =head1 SUPPORT
1117
1118 You can find documentation for this module with the perldoc command.
1119
1120     perldoc CPAN::Testers::ParseReport
1121
1122
1123 You can also look for information at:
1124
1125 =over 4
1126
1127 =item * RT: CPAN's request tracker
1128
1129 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=CPAN-Testers-ParseReport>
1130
1131 =item * AnnoCPAN: Annotated CPAN documentation
1132
1133 L<http://annocpan.org/dist/CPAN-Testers-ParseReport>
1134
1135 =item * CPAN Ratings
1136
1137 L<http://cpanratings.perl.org/d/CPAN-Testers-ParseReport>
1138
1139 =item * Search CPAN
1140
1141 L<http://search.cpan.org/dist/CPAN-Testers-ParseReport>
1142
1143 =back
1144
1145
1146 =head1 ACKNOWLEDGEMENTS
1147
1148 Thanks to RJBS for module-starter.
1149
1150 =head1 COPYRIGHT & LICENSE
1151
1152 Copyright 2008 Andreas König.
1153
1154 This program is free software; you can redistribute it and/or modify it
1155 under the same terms as Perl itself.
1156
1157
1158 =cut
1159
1160 1; # End of CPAN::Testers::ParseReport