lib/CPAN/Testers/ParseReport.pm

   1 package CPAN::Testers::ParseReport;
   2
   3 use warnings;
   4 use strict;
   5
   6 use Config::Perl::V ();
   7 use DateTime::Format::Strptime;
   8 use File::Basename qw(basename);
   9 use File::Path qw(mkpath);
  10 use HTML::Entities qw(decode_entities);
  11 use LWP::UserAgent;
  12 use List::Util qw(max min sum);
  13 use MIME::QuotedPrint ();
  14 use Time::Local ();
  15 use XML::LibXML;
  16 use XML::LibXML::XPathContext;
  17
  18 our $default_ctformat = "yaml";
  19 our $default_transport = "http_cpantesters";
  20 our $default_cturl = "http://www.cpantesters.org/show";
  21 our $Signal = 0;
  22
  23 =encoding utf-8
  24
  25 =head1 NAME
  26
  27 CPAN::Testers::ParseReport - parse reports to www.cpantesters.org from various sources
  28
  29 =cut
  30
  31 use version; our $VERSION = qv('0.1.15');
  32
  33 =head1 SYNOPSIS
  34
  35 The documentation in here is normally not needed because the code is
  36 meant to be run from the standalone program C<ctgetreports>.
  37
  38   ctgetreports --q mod:Moose Devel-Events
  39
  40 =head1 DESCRIPTION
  41
  42 This is the core module for CPAN::Testers::ParseReport. If you're not
  43 looking to extend or alter the behaviour of this module, you probably
  44 want to look at L<ctgetreports> instead.
  45
  46 =head1 OPTIONS
  47
  48 Options are described in the L<ctgetreports> manpage and are passed
  49 through to the functions unaltered.
  50
  51 =head1 FUNCTIONS
  52
  53 =head2 parse_distro($distro,%options)
  54
  55 reads the cpantesters HTML page or the YAML file or the local database
  56 for the distro and loops through the reports for the specified or most
  57 recent version of that distro found in these data.
  58
  59 parse_distro() intentionally has no meaningful return value, different
  60 options would require different ones.
  61
  62 =head2 $extract = parse_single_report($report,$dumpvars,%options)
  63
  64 mirrors and reads this report. $report is of the form
  65
  66   { id => number }
  67
  68 $dumpvar is a hashreference that gets filled with data.
  69
  70 $extract is the result of parse_report() described below.
  71
  72 =cut
  73
  74 {
  75     my $ua;
  76     sub _ua {
  77         return $ua if $ua;
  78         $ua = LWP::UserAgent->new
  79             (
  80              keep_alive => 1,
  81              env_proxy => 1,
  82             );
  83         $ua->parse_head(0);
  84
  85         # I would love to support gzipped transfer but it doesn't seem
  86         # to mix well with mirroring:
  87
  88         # $ua->default_header('Accept-Encoding' => scalar HTTP::Message::decodable());
  89
  90         $ua;
  91     }
  92 }
  93
  94 {
  95     my $xp;
  96     sub _xp {
  97         return $xp if $xp;
  98         $xp = XML::LibXML->new;
  99         $xp->keep_blanks(0);
 100         $xp->clean_namespaces(1);
 101         my $catalog = __FILE__;
 102         $catalog =~ s|ParseReport.pm$|ParseReport/catalog|;
 103         $xp->load_catalog($catalog);
 104         return $xp;
 105     }
 106 }
 107
 108 sub _download_overview {
 109     my($cts_dir, $distro, %Opt) = @_;
 110     my $format = $Opt{ctformat} ||= $default_ctformat;
 111     my $cturl = $Opt{cturl} ||= $default_cturl;
 112     my $ctarget = "$cts_dir/$distro.$format";
 113     my $cheaders = "$cts_dir/$distro.headers";
 114     if ($Opt{local}) {
 115         unless (-e $ctarget) {
 116             die "Alert: No local file '$ctarget' found, cannot continue\n";
 117         }
 118     } else {
 119         if (! -e $ctarget or -M $ctarget > .25) {
 120             if (-e $ctarget && $Opt{verbose}) {
 121                 my(@stat) = stat _;
 122                 my $timestamp = gmtime $stat[9];
 123                 print STDERR "(timestamp $timestamp GMT)\n" unless $Opt{quiet};
 124             }
 125             print STDERR "Fetching $ctarget..." if $Opt{verbose} && !$Opt{quiet};
 126             my $uri = "$cturl/$distro.$format";
 127             my $resp = _ua->mirror($uri,$ctarget);
 128             if ($resp->is_success) {
 129                 print STDERR "DONE\n" if $Opt{verbose} && !$Opt{quiet};
 130                 open my $fh, ">", $cheaders or die;
 131                 for ($resp->headers->as_string) {
 132                     print $fh $_;
 133                     if ($Opt{verbose} && $Opt{verbose}>1) {
 134                         print STDERR $_ unless $Opt{quiet};
 135                     }
 136                 }
 137             } elsif (304 == $resp->code) {
 138                 print STDERR "DONE (not modified)\n" if $Opt{verbose} && !$Opt{quiet};
 139                 my $atime = my $mtime = time;
 140                 utime $atime, $mtime, $cheaders;
 141             } else {
 142                 die sprintf
 143                     (
 144                      "No success downloading %s: %s",
 145                      $uri,
 146                      $resp->status_line,
 147                     );
 148             }
 149         }
 150     }
 151     return $ctarget;
 152 }
 153
 154 sub _parse_html {
 155     my($ctarget, %Opt) = @_;
 156     my $content = do { open my $fh, $ctarget or die; local $/; <$fh> };
 157     my $preprocesswithtreebuilder = 0; # not needed since barbie switched to XHTML
 158     if ($preprocesswithtreebuilder) {
 159         require HTML::TreeBuilder;
 160         my $tree = HTML::TreeBuilder->new;
 161         $tree->implicit_tags(1);
 162         $tree->p_strict(1);
 163         $tree->ignore_ignorable_whitespace(0);
 164         $tree->parse_content($content);
 165         $tree->eof;
 166         $content = $tree->as_XML;
 167     }
 168     my $parser = _xp();
 169     my $doc = eval { $parser->parse_string($content) };
 170     my $err = $@;
 171     unless ($doc) {
 172         my $distro = basename $ctarget;
 173         die sprintf "Error while parsing %s\: %s", $distro, $err;
 174     }
 175     my $xc = XML::LibXML::XPathContext->new($doc);
 176     my $nsu = $doc->documentElement->namespaceURI;
 177     $xc->registerNs('x', $nsu) if $nsu;
 178     my($selected_release_ul,$selected_release_distrov,$excuse_string);
 179     my($cparentdiv)
 180         = $nsu ?
 181             $xc->findnodes("/x:html/x:body/x:div[\@id = 'doc']") :
 182                 $doc->findnodes("/html/body/div[\@id = 'doc']");
 183     my(@releasedivs) = $nsu ?
 184         $xc->findnodes("//x:div[x:h2 and x:ul]",$cparentdiv) :
 185             $cparentdiv->findnodes("//div[h2 and ul]");
 186     my $releasediv;
 187     if ($Opt{vdistro}) {
 188         $excuse_string = "selected distro '$Opt{vdistro}'";
 189         my($fallbacktoversion) = $Opt{vdistro} =~ /(\d+\..*)/;
 190         $fallbacktoversion = 0 unless defined $fallbacktoversion;
 191       RELEASE: for my $i (0..$#releasedivs) {
 192             my $picked = "";
 193             my($x) = $nsu ?
 194                 $xc->findvalue("x:h2/x:a[2]/\@name",$releasedivs[$i]) :
 195                     $releasedivs[$i]->findvalue("h2/a[2]/\@name");
 196             if ($x) {
 197                 if ($x eq $Opt{vdistro}) {
 198                     $releasediv = $i;
 199                     $picked = " (picked)";
 200                 }
 201                 print STDERR "FOUND DISTRO: $x$picked\n" unless $Opt{quiet};
 202             } else {
 203                 ($x) = $nsu ?
 204                     $xc->findvalue("x:h2/x:a[1]/\@name",$releasedivs[$i]) :
 205                         $releasedivs[$i]->findvalue("h2/a[1]/\@name");
 206                 if ($x eq $fallbacktoversion) {
 207                     $releasediv = $i;
 208                     $picked = " (picked)";
 209                 }
 210                 print STDERR "FOUND VERSION: $x$picked\n" unless $Opt{quiet};
 211             }
 212         }
 213     } else {
 214         $excuse_string = "any distro";
 215     }
 216     unless (defined $releasediv) {
 217         $releasediv = 0;
 218     }
 219     # using a[1] because a[2] is missing on the first entry
 220     ($selected_release_distrov) = $nsu ?
 221         $xc->findvalue("x:h2/x:a[1]/\@name",$releasedivs[$releasediv]) :
 222             $releasedivs[$releasediv]->findvalue("h2/a[1]/\@name");
 223     ($selected_release_ul) = $nsu ?
 224         $xc->findnodes("x:ul",$releasedivs[$releasediv]) :
 225             $releasedivs[$releasediv]->findnodes("ul");
 226     unless (defined $selected_release_distrov) {
 227         warn "Warning: could not find $excuse_string in '$ctarget'";
 228         return;
 229     }
 230     print STDERR "SELECTED: $selected_release_distrov\n" unless $Opt{quiet};
 231     my($id);
 232     my @all;
 233     for my $test ($nsu ?
 234                   $xc->findnodes("x:li",$selected_release_ul) :
 235                   $selected_release_ul->findnodes("li")) {
 236         $id = $nsu ?
 237             $xc->findvalue("x:a[1]/text()",$test)     :
 238                 $test->findvalue("a[1]/text()");
 239         push @all, {id=>$id};
 240         return if $Signal;
 241     }
 242     return \@all;
 243 }
 244
 245 sub _parse_yaml {
 246     my($ctarget, %Opt) = @_;
 247     require YAML::Syck;
 248     my $arr = YAML::Syck::LoadFile($ctarget);
 249     my($selected_release_ul,$selected_release_distrov,$excuse_string);
 250     if ($Opt{vdistro}) {
 251         $excuse_string = "selected distro '$Opt{vdistro}'";
 252         $arr = [grep {$_->{distversion} eq $Opt{vdistro}} @$arr];
 253         ($selected_release_distrov) = $arr->[0]{distversion};
 254     } else {
 255         $excuse_string = "any distro";
 256         my $last_addition;
 257         my %seen;
 258         for my $report (sort { $a->{id} <=> $b->{id} } @$arr) {
 259             unless ($seen{$report->{distversion}}++) {
 260                 $last_addition = $report->{distversion};
 261             }
 262         }
 263         $arr = [grep {$_->{distversion} eq $last_addition} @$arr];
 264         ($selected_release_distrov) = $last_addition;
 265     }
 266     unless ($selected_release_distrov) {
 267         warn "Warning: could not find $excuse_string in '$ctarget'";
 268         return;
 269     }
 270     print STDERR "SELECTED: $selected_release_distrov\n" unless $Opt{quiet};
 271     my @all;
 272     for my $test (@$arr) {
 273         my $id = $test->{id};
 274         push @all, {id=>$id};
 275         return if $Signal;
 276     }
 277     @all = sort { $b->{id} <=> $a->{id} } @all;
 278     return \@all;
 279 }
 280
 281 sub parse_single_report {
 282     my($report, $dumpvars, %Opt) = @_;
 283     my($id) = $report->{id};
 284     $Opt{cachedir} ||= "$ENV{HOME}/var/cpantesters";
 285     my $nnt_dir = "$Opt{cachedir}/nntp-testers";
 286     mkpath $nnt_dir;
 287     my $target = "$nnt_dir/$id";
 288     if ($Opt{local}) {
 289         unless (-e $target) {
 290             die {severity=>0,text=>"Warning: No local file '$target' found, skipping\n"};
 291         }
 292     } else {
 293         if (! -e $target) {
 294             print STDERR "Fetching $target..." if $Opt{verbose} && !$Opt{quiet};
 295             $Opt{transport} ||= $default_transport;
 296             if (0) {
 297             } elsif ($Opt{transport} eq "http_cpantesters") {
 298                 my $resp = _ua->mirror("http://www.cpantesters.org/cgi-bin/pages.cgi?act=cpan-report&raw=1&id=$id",$target);
 299                 if ($resp->is_success) {
 300                     if ($Opt{verbose}) {
 301                         my(@stat) = stat $target;
 302                         my $timestamp = gmtime $stat[9];
 303                         print STDERR "(timestamp $timestamp GMT)\n" unless $Opt{quiet};
 304                         if ($Opt{verbose} > 1) {
 305                             print STDERR $resp->headers->as_string unless $Opt{quiet};
 306                         }
 307                     }
 308                     my $headers = "$target.headers";
 309                     open my $fh, ">", $headers or die {severity=>1,text=>"Could not open >$headers: $!"};
 310                     print $fh $resp->headers->as_string;
 311                 } else {
 312                     die {severity=>0,
 313                              text=>sprintf "HTTP Server Error[%s] for id[%s]", $resp->status_line, $id};
 314                 }
 315             } elsif ($Opt{transport} eq "http_cpantesters_gzip") {
 316                 if (-e "$target.gz") {
 317                     0 == system gunzip => $target or die;
 318                 }
 319                 my $resp = _ua->mirror("http://www.cpantesters.org/cgi-bin/pages.cgi?act=cpan-report&raw=1&id=$id",$target);
 320                 if ($resp->is_success) {
 321                     if ($Opt{verbose}) {
 322                         my(@stat) = stat $target;
 323                         my $timestamp = gmtime $stat[9];
 324                         print STDERR "(timestamp $timestamp GMT)\n" unless $Opt{quiet};
 325                         if ($Opt{verbose} > 1) {
 326                             print STDERR $resp->headers->as_string unless $Opt{quiet};
 327                         }
 328                     }
 329                     my $headers = "$target.headers";
 330                     open my $fh, ">", $headers or die {severity=>1,text=>"Could not open >$headers: $!"};
 331                     print $fh $resp->headers->as_string;
 332                     0 == system gzip => $target or die;
 333                 } else {
 334                     die {severity=>0,
 335                              text=>sprintf "HTTP Server Error[%s] for id[%s]", $resp->status_line, $id};
 336                 }
 337             } else {
 338                 die {severity=>1,text=>"Illegal value for --transport: '$Opt{transport}'"};
 339             }
 340         }
 341     }
 342     parse_report($target, $dumpvars, %Opt);
 343 }
 344
 345 sub parse_distro {
 346     my($distro,%Opt) = @_;
 347     my %dumpvars;
 348     $Opt{cachedir} ||= "$ENV{HOME}/var/cpantesters";
 349     my $cts_dir = "$Opt{cachedir}/cpantesters-show";
 350     mkpath $cts_dir;
 351     if ($Opt{solve}) {
 352         require Statistics::Regression;
 353         $Opt{dumpvars} = "." unless defined $Opt{dumpvars};
 354     }
 355     if (!$Opt{vdistro} && $distro =~ /^(.+)-(?i:v?\d+)(?:\.\d+)*\w*$/) {
 356         $Opt{vdistro} = $distro;
 357         $distro = $1;
 358     }
 359     my $reports;
 360     if (my $ctdb = $Opt{ctdb}) {
 361         require CPAN::WWW::Testers::Generator::Database;
 362         require CPAN::DistnameInfo;
 363         my $dbi = CPAN::WWW::Testers::Generator::Database->new(database=>$ctdb) or die "Alert: unknown error while opening database '$ctdb'";
 364         unless ($Opt{vdistro}) {
 365             my $sql = "select version from cpanstats where dist=? order by id";
 366             my @rows = $dbi->get_query($sql,$distro);
 367             my($newest,%seen);
 368             for my $row (@rows) {
 369                 $newest = $row->[0] unless $seen{$row->[0]}++;
 370             }
 371             $Opt{vdistro} = "$distro-$newest";
 372         }
 373         my $d = CPAN::DistnameInfo->new("FOO/$Opt{vdistro}.tgz");
 374         my $dist = $d->dist;
 375         my $version = $d->version;
 376         my $sql = "select id from cpanstats where dist=? and version=? order by id desc";
 377         my @rows = $dbi->get_query($sql,$dist,$version);
 378         my @all;
 379         for my $row (@rows) {
 380             my $id = $row->[0];
 381             push @all, {id=>$id};
 382         }
 383         $reports = \@all;
 384     } else {
 385         my $ctarget = _download_overview($cts_dir, $distro, %Opt);
 386         $Opt{ctformat} ||= $default_ctformat;
 387         if ($Opt{ctformat} eq "html") {
 388             $reports = _parse_html($ctarget,%Opt);
 389         } else {
 390             $reports = _parse_yaml($ctarget,%Opt);
 391         }
 392     }
 393     return unless $reports;
 394     my $sampled = 0;
 395     my $i = 0;
 396     my $samplesize = $Opt{sample} || 0;
 397     $samplesize = 0 if $samplesize && $samplesize >= @$reports;
 398  REPORT: for my $report (@$reports) {
 399         $i++;
 400         if ($samplesize) {
 401             my $need = $samplesize - $sampled;
 402             next REPORT unless $need;
 403             my $left = @$reports - $i;
 404             # warn sprintf "tot %d i %d sampled %d need %d left %d\n", scalar @$reports, $i, $sampled, $need, $left;
 405             my $want_this = (rand(1) <= ($need/$left));
 406             next REPORT unless $want_this;
 407         }
 408         eval {parse_single_report($report, \%dumpvars, %Opt)};
 409         if ($@) {
 410             if (ref $@) {
 411                 if ($@->{severity}) {
 412                     die $@->{text};
 413                 } else {
 414                     warn $@->{text};
 415                 }
 416             } else {
 417                 die $@;
 418             }
 419         }
 420         $sampled++;
 421         last if $Signal;
 422     }
 423     if ($Opt{dumpvars}) {
 424         require YAML::Syck;
 425         my $dumpfile = $Opt{dumpfile} || "ctgetreports.out";
 426         open my $fh, ">", $dumpfile or die "Could not open '$dumpfile' for writing: $!";
 427         print $fh YAML::Syck::Dump(\%dumpvars);
 428         close $fh or die "Could not close '$dumpfile': $!"
 429     }
 430     if ($Opt{solve}) {
 431         solve(\%dumpvars,%Opt);
 432     }
 433 }
 434
 435 =head2 $bool = _looks_like_qp($raw_report)
 436
 437 We had to acknowledge the fact that some MTAs swallow the MIME-Version
 438 header while passing MIME through. So we introduce fallback heuristics
 439 that try to determine if a report is written in quoted printable.
 440
 441 Note that this subroutine is internal, just documented to have the
 442 internals documented.
 443
 444 The current implementation counts the number of QP escaped spaces and
 445 equal signs.
 446
 447 =cut
 448
 449 sub _looks_like_qp {
 450     my($report) = @_;
 451     my $count_space = () = $report =~ /=20/g;
 452     return 1 if $count_space > 12;
 453     my $count_equal = () = $report =~ /=3D/g;
 454     return 1 if $count_equal > 12;
 455     return 1 if $count_space+$count_equal > 24;
 456     return 0; # waiting for a counter example
 457 }
 458
 459 =head2 $extract = parse_report($target,$dumpvars,%Opt)
 460
 461 Reads one report. $target is the local filename to read. $dumpvars is
 462 a hashref which gets filled with descriptive stats about
 463 PASS/FAIL/etc. %Opt are the options as described in the
 464 C<ctgetreports> manpage. $extract is a hashref containing the found
 465 variables.
 466
 467 Note: this parsing is a bit dirty but as it seems good enough I'm not
 468 inclined to change it. We parse HTML with regexps only, not an HTML
 469 parser. Only the entities are decoded.
 470
 471 In %Opt you can use
 472
 473     article => $some_full_article_as_scalar
 474
 475 to use this function to parse one full article as text. When this is
 476 given, the argument $target is not read, but its basename is taken to
 477 be the id of the article. (OMG, hackers!)
 478
 479 =cut
 480 sub parse_report {
 481     my($target,$dumpvars,%Opt) = @_;
 482     our @q;
 483     my $id = basename($target);
 484     # warn "DEBUG: id[$id]";
 485     my($ok,$about);
 486
 487     my(%extract);
 488
 489     my($report,$isHTML) = _get_cooked_report($target, \%Opt);
 490     my @qr = map /^qr:(.+)/, @{$Opt{q}};
 491     if ($Opt{raw} || @qr) {
 492         for my $qr (@qr) {
 493             my $cqr = eval "qr{$qr}";
 494             die "Could not compile regular expression '$qr': $@" if $@;
 495             my(@matches) = $report =~ $cqr;
 496             my $v;
 497             if (@matches) {
 498                 if (@matches==1) {
 499                     $v = $matches[0];
 500                 } else {
 501                     $v = join "", map {"($_)"} @matches;
 502                 }
 503             } else {
 504                 $v = "";
 505             }
 506             $extract{"qr:$qr"} = $v;
 507         }
 508     }
 509
 510     my $report_writer;
 511     my $moduleunpack = {};
 512     my $expect_prereq = 0;
 513     my $expect_toolchain = 0;
 514     my $expecting_toolchain_soon = 0;
 515     my $fallback_p5 = "";
 516
 517     my $in_summary = 0;
 518     my $in_summary_seen_platform = 0;
 519     my $in_prg_output = 0;
 520     my $in_env_context = 0;
 521
 522     my $current_headline;
 523     my @previous_line = ""; # so we can neutralize line breaks
 524     my @rlines = split /\r?\n/, $report;
 525  LINE: for (@rlines) {
 526         next LINE unless ($isHTML ? m/<title>((\S+)\s+(\S+))/ : m/^Subject:\s*((\S+)\s+(\S+))/);
 527         my $s = $1;
 528         $s = $1 if $s =~ m{<strong>(.+)};
 529         if ($s =~ /(\S+)\s+(\S+)/) {
 530             $ok = $1;
 531             $about = $2;
 532         }
 533         $extract{"meta:ok"}    = $ok;
 534         $extract{"meta:about"} = $about;
 535         last;
 536     }
 537     unless ($extract{"meta:about"}) {
 538         $extract{"meta:about"} = $Opt{vdistro};
 539         unless ($extract{"meta:ok"}) {
 540             $DB::single++;
 541             warn "Warning: could not determine state of report";
 542         }
 543     }
 544  LINE: while (@rlines) {
 545         $_ = shift @rlines;
 546         while (/!$/ and @rlines) {
 547             my $followupline = shift @rlines;
 548             $followupline =~ s/^\s+//; # remo leading space
 549             $_ .= $followupline;
 550         }
 551         if (/^--------/ && $previous_line[-2] && $previous_line[-2] =~ /^--------/) {
 552             $current_headline = $previous_line[-1];
 553             if ($current_headline =~ /PROGRAM OUTPUT/) {
 554                 $in_prg_output = 1;
 555             } else {
 556                 $in_prg_output = 0;
 557             }
 558             if ($current_headline =~ /ENVIRONMENT AND OTHER CONTEXT/) {
 559                 $in_env_context = 1;
 560             } else {
 561                 $in_env_context = 0;
 562             }
 563         }
 564         if ($extract{"meta:perl"}) {
 565             if (    $in_summary
 566                 and !$extract{"conf:git_commit_id"}
 567                 and /Commit id:\s*([[:xdigit:]]+)/) {
 568                 $extract{"conf:git_commit_id"} = $1;
 569             }
 570         } else {
 571             my $p5;
 572             if (0) {
 573             } elsif (/Summary of my perl5 \((.+)\) configuration:/) {
 574                 $p5 = $1;
 575                 $in_summary = 1;
 576                 $in_env_context = 0;
 577             }
 578             if ($p5) {
 579                 my($r,$v,$s,$p);
 580                 if (($r,$v,$s,$p) = $p5 =~ /revision (\S+) version (\S+) subversion (\S+) patch (\S+)/) {
 581                     $r =~ s/\.0//; # 5.0 6 2!
 582                     $extract{"meta:perl"} = "$r.$v.$s\@$p";
 583                 } elsif (($r,$v,$s) = $p5 =~ /revision (\S+) version (\S+) subversion (\S+)/) {
 584                     $r =~ s/\.0//;
 585                     $extract{"meta:perl"} = "$r.$v.$s";
 586                 } elsif (($r,$v,$s) = $p5 =~ /(\d+\S*) patchlevel (\S+) subversion (\S+)/) {
 587                     $r =~ s/\.0//;
 588                     $extract{"meta:perl"} = "$r.$v.$s";
 589                 } else {
 590                     $extract{"meta:perl"} = $p5;
 591                 }
 592             }
 593         }
 594         unless ($extract{"meta:from"}) {
 595             if (0) {
 596             } elsif ($isHTML ?
 597                      m|<div class="h_name">From:</div> <b>(.+?)</b><br/>| :
 598                      m|^From:\s*(.+)|
 599                     ) {
 600                 my $f = $1;
 601                 $f = $1 if $f =~ m{<strong>(.+)</strong>};
 602                 $extract{"meta:from"} = $f;
 603             }
 604             $extract{"meta:from"} =~ s/\.$// if $extract{"meta:from"};
 605         }
 606         unless ($extract{"meta:date"}) {
 607             if (0) {
 608             } elsif ($isHTML ?
 609                      m|<div class="h_name">Date:</div> (.+?)<br/>| :
 610                      m|^Date:\s*(.+)|
 611                     ) {
 612                 my $date = $1;
 613                 $date = $1 if $date =~ m{<strong>(.+)</strong>};
 614                 my($dt);
 615             DATEFMT: for my $pat ("%Y-%m-%dT%TZ", # 2010-07-07T14:01:40Z
 616                                   "%a, %d %b %Y %T %z", # Sun, 28 Sep 2008 12:23:12 +0100
 617                                   "%b %d, %Y %R", # July 10,...
 618                                   "%b  %d, %Y %R", # July  4,...
 619                                  ) {
 620                     $dt = eval {
 621                         my $p = DateTime::Format::Strptime->new
 622                             (
 623                              locale => "en",
 624                              time_zone => "UTC",
 625                              pattern => $pat,
 626                             );
 627                         $p->parse_datetime($date)
 628                     };
 629                     last DATEFMT if $dt;
 630                 }
 631                 unless ($dt) {
 632                     warn "Could not parse date[$date], setting to epoch 0";
 633                     $dt = DateTime->from_epoch( epoch => 0 );
 634                 }
 635                 $extract{"meta:date"} = $dt->datetime;
 636             }
 637             $extract{"meta:date"} =~ s/\.$// if $extract{"meta:date"};
 638         }
 639         unless ($extract{"meta:writer"}) {
 640             for ("$previous_line[-1] $_") {
 641                 if (0) {
 642                 } elsif (/CPANPLUS, version (\S+)/) {
 643                     $extract{"meta:writer"} = "CPANPLUS $1";
 644                 } elsif (/created (?:automatically )?by (\S+)/) {
 645                     $extract{"meta:writer"} = $1;
 646                     if (/\s+on\s+perl\s+([^,]+),/) {
 647                         $fallback_p5 = $1;
 648                     }
 649                 } elsif (/This report was machine-generated by (\S+) (\S+)/) {
 650                     $extract{"meta:writer"} = "$1 $2";
 651                 }
 652                 $extract{"meta:writer"} =~ s/[\.,]$// if $extract{"meta:writer"};
 653             }
 654         }
 655         if ($in_summary) {
 656             # we do that first three lines a bit too often
 657             my $qr = $Opt{dumpvars} || "";
 658             $qr = qr/$qr/ if $qr;
 659             unless (@q) {
 660                 @q = @{$Opt{q}||[]};
 661                 @q = qw(meta:perl conf:archname conf:usethreads conf:optimize meta:writer meta:from) unless @q;
 662             }
 663
 664             my %conf_vars = map {($_ => 1)} grep { /^conf:/ } @q;
 665
 666             if (/^\s+Platform:$/) {
 667                 $in_summary_seen_platform=1;
 668             } elsif (/^\s*$/ || m|</pre>|) {
 669                 # if not html, we have reached the end now
 670                 if ($in_summary_seen_platform) {
 671                     # some perls have an empty line after the summary line
 672                     $in_summary = 0;
 673                 }
 674             } else {
 675                 my(%kv) = /\G,?\s*([^=]+)=('[^']+?'|\S+)/gc;
 676                 while (my($k,$v) = each %kv) {
 677                     my $ck = "conf:$k";
 678                     $ck =~ s/\s+$//;
 679                     $v =~ s/,$//;
 680                     if ($v =~ /^'(.*)'$/) {
 681                         $v = $1;
 682                     }
 683                     $v =~ s/^\s+//;
 684                     $v =~ s/\s+$//;
 685                     if ($qr && $ck =~ $qr) {
 686                         $extract{$ck} = $v;
 687                     } elsif ($conf_vars{$ck}) {
 688                         $extract{$ck} = $v;
 689                     }
 690                 }
 691             }
 692         }
 693         if ($in_prg_output) {
 694             unless ($extract{"meta:output_from"}) {
 695                 if (/Output from (.+):$/) {
 696                     $extract{"meta:output_from"} = $1
 697                 }
 698             }
 699         }
 700         if ($in_env_context) {
 701             if (/^\s{4}(\S+)\s*=\s*(.*)$/) {
 702                 $extract{"env:$1"} = $2;
 703             }
 704         }
 705         push @previous_line, $_;
 706         if ($expect_prereq || $expect_toolchain) {
 707             if (/Perl module toolchain versions installed/) {
 708                 # first time discovered in CPANPLUS 0.89_06
 709                 $expecting_toolchain_soon = 1;
 710                 $expect_prereq=0;
 711                 next LINE;
 712             }
 713             if (exists $moduleunpack->{type}) {
 714                 my($module,$v,$needwant);
 715                 # type 1 and 2 are about prereqs, type three about toolchain
 716                 if ($moduleunpack->{type} == 1) {
 717                     (my $leader,$module,$needwant,$v) = eval { unpack $moduleunpack->{tpl}, $_; };
 718                     next LINE if $@;
 719                     if ($leader =~ /^-/) {
 720                         $moduleunpack = {};
 721                         $expect_prereq = 0;
 722                         next LINE;
 723                     } elsif ($leader =~ /^(
 724                                          buil          # build_requires:
 725                                          |conf         # configure_requires:
 726                                         )/x) {
 727                         next LINE;
 728                     } elsif ($module =~ /^(
 729                                          -             # line drawing
 730                                         )/x) {
 731                         next LINE;
 732                     }
 733                 } elsif ($moduleunpack->{type} == 2) {
 734                     (my $leader,$module,$v,$needwant) = eval { unpack $moduleunpack->{tpl}, $_; };
 735                     next LINE if $@;
 736                     for ($module,$v,$needwant) {
 737                         s/^\s+//;
 738                         s/\s+$//;
 739                     }
 740                     if ($leader =~ /^\*/) {
 741                         $moduleunpack = {};
 742                         $expect_prereq = 0;
 743                         next LINE;
 744                     } elsif (!defined $v
 745                              or !defined $needwant
 746                              or $v =~ /\s/
 747                              or $needwant =~ /\s/
 748                             ) {
 749                         ($module,$v,$needwant) = split " ", $_;
 750                     }
 751                 } elsif ($moduleunpack->{type} == 3) {
 752                     (my $leader,$module,$v) = eval { unpack $moduleunpack->{tpl}, $_; };
 753                     next LINE if $@;
 754                     if (!$module) {
 755                         $moduleunpack = {};
 756                         $expect_toolchain = 0;
 757                         next LINE;
 758                     } elsif ($module =~ /^-/) {
 759                         next LINE;
 760                     }
 761                 }
 762                 $module =~ s/\s+$//;
 763                 if ($module) {
 764                     $v =~ s/^\s+//;
 765                     $v =~ s/\s+$//;
 766                     my($modulename,$versionlead) = split " ", $module;
 767                     if (defined $modulename and defined $versionlead) {
 768                         $module = $modulename;
 769                         $v = "$versionlead$v";
 770                     }
 771                     if ($v eq "Have") {
 772                         next LINE;
 773                     }
 774                     $extract{"mod:$module"} = $v;
 775                     if ($needwant) {
 776                         $needwant =~ s/^\s+//;
 777                         $needwant =~ s/\s+$//;
 778                         $extract{"prereq:$module"} = $needwant;
 779                     }
 780                 }
 781             }
 782             if (/(\s+)(Module\s+)(Need\s+)Have/) {
 783                 $in_env_context = 0;
 784                 $moduleunpack = {
 785                                  tpl => 'a'.length($1).'a'.length($2).'a'.length($3).'a*',
 786                                  type => 1,
 787                                 };
 788             } elsif (/(\s+)(Module Name\s+)(Have)(\s+)Want/) {
 789                 $in_env_context = 0;
 790                 my $adjust_1 = 0;
 791                 my $adjust_2 = -length($4);
 792                 my $adjust_3 = length($4);
 793                 # I think they do not really try to align, usually we
 794                 # get away with split
 795                 $moduleunpack = {
 796                                  tpl => 'a'.length($1).'a'.(length($2)+$adjust_2).'a'.(length($3)+$adjust_3).'a*',
 797                                  type => 2,
 798                                 };
 799             }
 800         }
 801         if (/PREREQUISITES|Prerequisite modules loaded/) {
 802             $in_env_context = 0;
 803             $expect_prereq=1;
 804         }
 805         if ($expecting_toolchain_soon) {
 806             if (/(\s+)(Module(?:\sName)?\s+) Have/) {
 807                 $in_env_context = 0;
 808                 $expect_toolchain=1;
 809                 $expecting_toolchain_soon=0;
 810                 $moduleunpack = {
 811                                  tpl => 'a'.length($1).'a'.length($2).'a*',
 812                                  type => 3,
 813                                 };
 814             }
 815         }
 816         if (/toolchain versions installed/) {
 817             $in_env_context = 0;
 818             $expecting_toolchain_soon=1;
 819         }
 820     }                           # LINE
 821     if (! $extract{"meta:perl"} && $fallback_p5) {
 822         my($p5,$patch) = split /\s+patch\s+/, $fallback_p5;
 823         $extract{"meta:perl"} = $p5;
 824         $extract{"conf:git_describe"} = $patch if defined $patch;
 825     }
 826     $extract{id} = $id;
 827     if (my $filtercbbody = $Opt{filtercb}) {
 828         my $filtercb = eval('sub {'.$filtercbbody.'}');
 829         $filtercb->(\%extract);
 830     }
 831     if ($Opt{solve}) {
 832         if ($extract{"conf:osvers"} && $extract{"conf:archname"}) {
 833             $extract{"conf:archname+osvers"} = join " ", @extract{"conf:archname","conf:osvers"};
 834         }
 835         my $data = $dumpvars->{"==DATA=="} ||= [];
 836         push @$data, \%extract;
 837     }
 838     # ---- %extract finished ----
 839     my $diag = "";
 840     if (my $qr = $Opt{dumpvars}) {
 841         $qr = qr/$qr/;
 842         while (my($k,$v) = each %extract) {
 843             if ($k =~ $qr) {
 844                 $dumpvars->{$k}{$v}{$extract{"meta:ok"}}++;
 845             }
 846         }
 847     }
 848     for my $want (@q) {
 849         my $have  = $extract{$want} || "";
 850         $diag .= " $want\[$have]";
 851     }
 852     printf STDERR " %-4s %8d%s\n", $extract{"meta:ok"}, $id, $diag unless $Opt{quiet};
 853     if ($Opt{raw}) {
 854         $report =~ s/\s+\z//;
 855         print STDERR $report, "\n================\n" unless $Opt{quiet};
 856     }
 857     if ($Opt{interactive}) {
 858         require IO::Prompt;
 859         local @ARGV;
 860         local $ARGV;
 861         my $ans = IO::Prompt::prompt
 862             (
 863              -p => "View $id? [onechar: ynq] ",
 864              -d => "y",
 865              -u => qr/[ynq]/,
 866              -onechar,
 867             );
 868         print STDERR "\n" unless $Opt{quiet};
 869         if ($ans eq "y") {
 870             my($report) = _get_cooked_report($target, \%Opt);
 871             $Opt{pager} ||= "less";
 872             open my $lfh, "|-", $Opt{pager} or die "Could not fork '$Opt{pager}': $!";
 873             local $/;
 874             print {$lfh} $report;
 875             close $lfh or die "Could not close pager: $!"
 876         } elsif ($ans eq "q") {
 877             $Signal++;
 878             return;
 879         }
 880     }
 881     return \%extract;
 882 }
 883
 884 sub _get_cooked_report {
 885     my($target, $Opt_ref) = @_;
 886     my($report, $isHTML);
 887     if ($report = $Opt_ref->{article}) {
 888         $isHTML = $report =~ /^</;
 889         undef $target;
 890     }
 891     if ($target) {
 892         my $fh;
 893         if (0) {
 894         } elsif (-e $target) {
 895             open $fh, '<', $target or die "Could not open '$target': $!";
 896         } elsif (-e "$target.gz") {
 897             open $fh, "-|", "zcat", $target or die "Could not open '$target.gz': $!";
 898         } else {
 899             die "Could not find '$target' or '$target.gz'";
 900         }
 901         local $/;
 902         my $raw_report = <$fh>;
 903         $isHTML = $raw_report =~ /^</;
 904         if ($isHTML) {
 905             if ($raw_report =~ m{^<\?.+?<html.+?<head.+?<body.+?<pre[^>]*>(.+)</pre>.*</body>.*</html>}s) {
 906                 $raw_report = decode_entities($1);
 907                 $isHTML = 0;
 908             }
 909         }
 910         if ($isHTML) {
 911             $report = decode_entities($raw_report);
 912         } elsif ($raw_report =~ /^MIME-Version: 1.0$/m
 913                  ||
 914                  _looks_like_qp($raw_report)
 915                 ) {
 916             # minimizing MIME effort; don't know about reports in other formats
 917             $report = MIME::QuotedPrint::decode_qp($raw_report);
 918         } else {
 919             $report = $raw_report;
 920         }
 921         close $fh;
 922     }
 923     if ($report =~ /\r\n/) {
 924         my @rlines = split /\r?\n/, $report;
 925         $report = join "\n", @rlines;
 926     }
 927     ($report, $isHTML);
 928 }
 929
 930 =head2 solve
 931
 932 Feeds a couple of potentially interesting data to
 933 Statistics::Regression and sorts the result by R^2 descending. Do not
 934 confuse this with a prove, rather take it as a useful hint. It can
 935 save you minutes of staring at data and provide a quick overview where
 936 one should look closer. Displays the N top candidates, where N
 937 defaults to 3 and can be set with the C<$Opt{solvetop}> variable.
 938 Regressions results with an R^2 of 1.00 are displayed in any case.
 939
 940 The function is called when the option C<-solve> is give on the
 941 commandline. Several extra config variables are calculated, see source
 942 code for details.
 943
 944 =cut
 945 {
 946     my %never_solve_on = map {($_ => 1)}
 947         (
 948          "conf:ccflags",
 949          "conf:config_args",
 950          "conf:cppflags",
 951          "conf:lddlflags",
 952          "conf:uname",
 953          "env:PATH",
 954          "env:PERL5LIB",
 955          "env:PERL5OPT",
 956          'env:$^X',
 957          'env:$EGID',
 958          'env:$GID',
 959          'env:$UID/$EUID',
 960          'env:PERL5_CPANPLUS_IS_RUNNING',
 961          'env:PERL5_CPAN_IS_RUNNING',
 962          'env:PERL5_CPAN_IS_RUNNING_IN_RECURSION',
 963          'meta:ok',
 964         );
 965     my %normalize_numeric =
 966         (
 967          id => sub { return shift },
 968          'meta:date' => sub {
 969              my $v = shift;
 970              my($Y,$M,$D,$h,$m,$s) = $v =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)/;
 971              unless (defined $M) {
 972                  die "illegal value[$v] for a date";
 973              }
 974              Time::Local::timegm($s,$m,$h,$D,$M-1,$Y);
 975          },
 976         );
 977     my %normalize_value =
 978         (
 979          'meta:perl' => sub {
 980              my($perlatpatchlevel) = shift;
 981              my $perl = $perlatpatchlevel;
 982              $perl =~ s/\@.*//;
 983              $perl;
 984          },
 985         );
 986 sub solve {
 987     my($V,%Opt) = @_;
 988     require Statistics::Regression;
 989     my @regression;
 990     my $ycb;
 991     if (my $ycbbody = $Opt{ycb}) {
 992         $ycb = eval('sub {'.$ycbbody.'}');
 993         die if $@;
 994     } else {
 995         $ycb = sub {
 996             my $rec = shift;
 997             my $y;
 998             if ($rec->{"meta:ok"} eq "PASS") {
 999                 $y = 1;
1000             } elsif ($rec->{"meta:ok"} eq "FAIL") {
1001                 $y = 0;
1002             }
1003             return $y
1004         };
1005     }
1006   VAR: for my $variable (sort keys %$V) {
1007         next if $variable eq "==DATA==";
1008         if ($never_solve_on{$variable}){
1009             warn "Skipping '$variable'\n" unless $Opt{quiet};
1010             next VAR;
1011         }
1012         my $value_distribution = $V->{$variable};
1013         my $keys = keys %$value_distribution;
1014         my @X = qw(const);
1015         if ($normalize_numeric{$variable}) {
1016             push @X, "n_$variable";
1017         } else {
1018             my %seen = ();
1019             for my $value (sort keys %$value_distribution) {
1020                 my $pf = $value_distribution->{$value};
1021                 $pf->{PASS} ||= 0;
1022                 $pf->{FAIL} ||= 0;
1023                 if ($pf->{PASS} || $pf->{FAIL}) {
1024                     my $Xele = sprintf "eq_%s",
1025                         (
1026                          $normalize_value{$variable} ?
1027                          $normalize_value{$variable}->($value) :
1028                          $value
1029                         );
1030                     push @X, $Xele unless $seen{$Xele}++;
1031
1032                 }
1033                 if (
1034                     $pf->{PASS} xor $pf->{FAIL}
1035                    ) {
1036                     my $vl = 40;
1037                     substr($value,$vl) = "..." if length $value > 3+$vl;
1038                     my $poor_mans_freehand_estimation = 0;
1039                     if ($poor_mans_freehand_estimation) {
1040                         warn sprintf
1041                             (
1042                              "%4d %4d %-23s | %s\n",
1043                              $pf->{PASS},
1044                              $pf->{FAIL},
1045                              $variable,
1046                              $value,
1047                             );
1048                     }
1049                 }
1050             }
1051         }
1052         warn "variable[$variable]keys[$keys]X[@X]\n" unless $Opt{quiet};
1053         next VAR unless @X > 1;
1054         my %regdata =
1055             (
1056              X => \@X,
1057              data => [],
1058             );
1059       RECORD: for my $rec (@{$V->{"==DATA=="}}) {
1060             my $y = $ycb->($rec);
1061             next RECORD unless defined $y;
1062             my %obs;
1063             $obs{Y} = $y;
1064             @obs{@X} = (0) x @X;
1065             $obs{const} = 1;
1066             for my $x (@X) {
1067                 if ($x =~ /^eq_(.+)/) {
1068                     my $read_v = $1;
1069                     if (exists $rec->{$variable}
1070                         && defined $rec->{$variable}
1071                        ) {
1072                         my $use_v = (
1073                                      $normalize_value{$variable} ?
1074                                      $normalize_value{$variable}->($rec->{$variable}) :
1075                                      $rec->{$variable}
1076                                     );
1077                         if ($use_v eq $read_v) {
1078                             $obs{$x} = 1;
1079                         }
1080                     }
1081                     # warn "DEBUG: y[$y]x[$x]obs[$obs{$x}]\n";
1082                 } elsif ($x =~ /^n_(.+)/) {
1083                     my $v = $1;
1084                     $obs{$x} = eval { $normalize_numeric{$v}->($rec->{$v}); };
1085                     if ($@) {
1086                         warn "Warning: error during parsing v[$v] in record[$rec->{id}]: $@; continuing with undef value";
1087                     }
1088                 }
1089             }
1090             push @{$regdata{data}}, \%obs;
1091         }
1092         _run_regression ($variable, \%regdata, \@regression, \%Opt);
1093     }
1094     my $top = min ($Opt{solvetop} || 3, scalar @regression);
1095     my $max_rsq = sum map {1==$_->rsq ? 1 : 0} @regression;
1096     $top = $max_rsq if $max_rsq && $max_rsq > $top;
1097     my $score = 0;
1098     printf
1099         (
1100          "State after regression testing: %d results, showing top %d\n\n",
1101          scalar @regression,
1102          $top,
1103         );
1104     for my $reg (sort {
1105                      $b->rsq <=> $a->rsq
1106                      ||
1107                      $a->k <=> $b->k
1108                  } @regression) {
1109         printf "(%d)\n", ++$score;
1110         eval { $reg->print; };
1111         if ($@) {
1112             printf "\n\nOops, Statistics::Regression died during ->print() with error message[$@]\n\n";
1113         }
1114         last if --$top <= 0;
1115     }
1116 }
1117 }
1118
1119 # $variable is the name we pass through to S:R constructor
1120 # $regdata is hash and has the arrays "X" and "data" (observations)
1121 # X goes to S:R constructor
1122 # each observation has a Y which we pass to S:R in an include() call
1123 # $regression is the collector array of results
1124 # $opt are the options from outside, used to see if we are "verbose"
1125 sub _run_regression {
1126     my($variable,$regdata,$regression,$opt) = @_;
1127     my @X = @{$regdata->{X}};
1128     # my $splo = $regdata->{"spliced-out"} = []; # maybe can be used to
1129                                                # hold the reference
1130                                                # group
1131     while (@X > 1) {
1132         my $reg = Statistics::Regression->new($variable,\@X);
1133         for my $obs (@{$regdata->{data}}) {
1134             my $y = delete $obs->{Y};
1135             $reg->include($y, $obs);
1136             $obs->{Y} = $y;
1137         }
1138         eval {$reg->theta;
1139               my @e = $reg->standarderrors;
1140               die "found standarderrors == 0" if grep { 0 == $_ } @e;
1141               $reg->rsq;};
1142         if ($@) {
1143             if ($opt->{verbose} && $opt->{verbose}>=2) {
1144                 require YAML::Syck;
1145                 warn YAML::Syck::Dump
1146                     ({error=>"could not determine some regression parameters",
1147                       variable=>$variable,
1148                       k=>$reg->k,
1149                       n=>$reg->n,
1150                       X=>$regdata->{"X"},
1151                       errorstr => $@,
1152                      });
1153             }
1154             # reduce k in case that linear dependencies disturbed us;
1155             # often called reference group; I'm tempted to collect and
1156             # make visible
1157             splice @X, 1, 1;
1158         } else {
1159             # $reg->print;
1160             push @$regression, $reg;
1161             return;
1162         }
1163     }
1164 }
1165
1166 =head1 AUTHOR
1167
1168 Andreas König
1169
1170 =head1 BUGS
1171
1172 Please report any bugs or feature requests through the web
1173 interface at
1174 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=CPAN-Testers-ParseReport>.
1175 I will be notified, and then you'll automatically be notified of
1176 progress on your bug as I make changes.
1177
1178 =head1 SUPPORT
1179
1180 You can find documentation for this module with the perldoc command.
1181
1182     perldoc CPAN::Testers::ParseReport
1183
1184
1185 You can also look for information at:
1186
1187 =over 4
1188
1189 =item * RT: CPAN's request tracker
1190
1191 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=CPAN-Testers-ParseReport>
1192
1193 =item * AnnoCPAN: Annotated CPAN documentation
1194
1195 L<http://annocpan.org/dist/CPAN-Testers-ParseReport>
1196
1197 =item * CPAN Ratings
1198
1199 L<http://cpanratings.perl.org/d/CPAN-Testers-ParseReport>
1200
1201 =item * Search CPAN
1202
1203 L<http://search.cpan.org/dist/CPAN-Testers-ParseReport>
1204
1205 =back
1206
1207
1208 =head1 ACKNOWLEDGEMENTS
1209
1210 Thanks to RJBS for module-starter.
1211
1212 =head1 COPYRIGHT & LICENSE
1213
1214 Copyright 2008 Andreas König.
1215
1216 This program is free software; you can redistribute it and/or modify it
1217 under the same terms as Perl itself.
1218
1219
1220 =cut
1221
1222 1; # End of CPAN::Testers::ParseReport