add ack to rjbs; catch a HTML quot and convert it to a quote
[cpan-testers-parsereport.git] / lib / CPAN / Testers / ParseReport.pm
blob7077a44c863f16fd41b7815b0fd4dd1c1e85de9d
1 package CPAN::Testers::ParseReport;
3 use warnings;
4 use strict;
6 use DateTime::Format::Strptime;
7 use File::Basename qw(basename);
8 use File::Path qw(mkpath);
9 use LWP::UserAgent;
10 use HTML::TreeBuilder ();
11 use XML::LibXML;
12 use XML::LibXML::XPathContext;
14 our $Signal = 0;
16 =head1 NAME
18 CPAN::Testers::ParseReport - parse reports to cpantesters.perl.org from various sources
20 =head1 VERSION
22 Version 0.0.4
24 =cut
26 use version; our $VERSION = qv('0.0.4');
29 =head1 SYNOPSIS
31 Nothing in here is meant for public consumption. Use C<ctgetreports>
32 from the commandline.
34 ctgetreports --q mod:Moose Devel-Events
36 =head1 DESCRIPTION
38 This is the core module for CPAN::Testers::ParseReport. If you're not
39 looking to extend or alter the behaviour of this module, you probably
40 want to look at C<ctgetreports> instead.
42 =head1 OPTIONS
44 Are described in the <ctgetreports> manpage and are passed through to
45 the functions unaltered.
47 =head1 FUNCTIONS
49 =head2 parse_distro($distro,$options)
51 reads the cpantesters HTML page for the distro and loops through the
52 reports for the first (usually most recent) version of that distro
53 found on that page.
55 =cut
58 my $ua;
59 sub _ua {
60 return $ua if $ua;
61 $ua = LWP::UserAgent->new;
62 $ua->parse_head(0);
63 $ua;
67 sub _download_overview {
68 my($cts_dir, $distro, %Opt) = @_;
69 my $format = $Opt{ctformat} || "html";
70 my $ctarget = "$cts_dir/$distro.$format";
71 my $cheaders = "$cts_dir/$distro.headers";
72 if (! -e $ctarget or (!$Opt{local} && -M $ctarget > .25)) {
73 if (-e $ctarget && $Opt{verbose}) {
74 my(@stat) = stat _;
75 my $timestamp = gmtime $stat[9];
76 print "(timestamp $timestamp GMT)\n";
78 print "Fetching $ctarget..." if $Opt{verbose};
79 my $resp = _ua->mirror("http://cpantesters.perl.org/show/$distro.$format",$ctarget);
80 if ($resp->is_success) {
81 print "DONE\n" if $Opt{verbose};
82 open my $fh, ">", $cheaders or die;
83 for ($resp->headers->as_string) {
84 print $fh $_;
85 if ($Opt{verbose} && $Opt{verbose}>1) {
86 print;
89 } elsif (304 == $resp->code) {
90 print "DONE (not modified)\n";
91 my $atime = my $mtime = time;
92 utime $atime, $mtime, $cheaders;
93 } else {
94 die $resp->status_line;
97 return $ctarget;
100 sub _parse_html {
101 my($ctarget, %Opt) = @_;
102 my $tree = HTML::TreeBuilder->new;
103 $tree->implicit_tags(1);
104 $tree->p_strict(1);
105 $tree->ignore_ignorable_whitespace(0);
106 my $ccontent = do { open my $fh, $ctarget or die; local $/; <$fh> };
107 $tree->parse_content($ccontent);
108 $tree->eof;
109 my $content = $tree->as_XML;
110 my $parser = XML::LibXML->new;;
111 $parser->keep_blanks(0);
112 $parser->clean_namespaces(1);
113 my $doc = eval { $parser->parse_string($content) };
114 my $err = $@;
115 unless ($doc) {
116 my $distro = basename $ctarget;
117 die sprintf "Error while parsing %s\: %s", $distro, $err;
119 $parser->clean_namespaces(1);
120 my $xc = XML::LibXML::XPathContext->new($doc);
121 my $nsu = $doc->documentElement->namespaceURI;
122 $xc->registerNs('x', $nsu) if $nsu;
123 # $DB::single++;
124 my($selected_release_ul,$selected_release_distrov,$excuse_string);
125 if ($Opt{vdistro}) {
126 $excuse_string = "selected distro '$Opt{vdistro}'";
127 ($selected_release_distrov) = $nsu ? $xc->findvalue("/x:html/x:body/x:div[\@id = 'doc']/x:div//x:h2[x:a/\@id = '$Opt{vdistro}']/x:a/\@id") :
128 $doc->findvalue("/html/body/div[\@id = 'doc']/div//h2[a/\@id = '$Opt{vdistro}']/a/\@id");
129 ($selected_release_ul) = $nsu ? $xc->findnodes("/x:html/x:body/x:div[\@id = 'doc']/x:div//x:h2[x:a/\@id = '$Opt{vdistro}']/following-sibling::ul[1]") :
130 $doc->findnodes("/html/body/div[\@id = 'doc']/div//h2[a/\@id = '$Opt{vdistro}']/following-sibling::ul[1]");
131 } else {
132 $excuse_string = "any distro";
133 ($selected_release_distrov) = $nsu ? $xc->findvalue("/x:html/x:body/x:div[\@id = 'doc']/x:div//x:h2[1]/x:a/\@id") :
134 $doc->findvalue("/html/body/div[\@id = 'doc']/div//h2[1]/a/\@id");
135 ($selected_release_ul) = $nsu ? $xc->findnodes("/x:html/x:body/x:div[\@id = 'doc']/x:div//x:ul[1]") :
136 $doc->findnodes("/html/body/div[\@id = 'doc']/div//ul[1]");
138 unless ($selected_release_distrov) {
139 warn "Warning: could not find $excuse_string in '$ctarget'";
140 return;
142 print "SELECTED: $selected_release_distrov\n";
143 my($ok,$id);
144 my @all;
145 for my $test ($nsu ? $xc->findnodes("x:li",$selected_release_ul) : $selected_release_ul->findnodes("li")) {
146 $ok = $nsu ? $xc->findvalue("x:span[1]/\@class",$test) : $test->findvalue("span[1]/\@class");
147 $id = $nsu ? $xc->findvalue("x:a[1]/text()",$test) : $test->findvalue("a[1]/text()");
148 push @all, {id=>$id,ok=>$ok};
149 return if $Signal;
151 return \@all;
154 sub _parse_yaml {
155 my($ctarget, %Opt) = @_;
156 require YAML::Syck;
157 my $arr = YAML::Syck::LoadFile($ctarget);
158 my($selected_release_ul,$selected_release_distrov,$excuse_string);
159 if ($Opt{vdistro}) {
160 $excuse_string = "selected distro '$Opt{vdistro}'";
161 $arr = [grep {$_->{distversion} eq $Opt{vdistro}} @$arr];
162 ($selected_release_distrov) = $arr->[0]{distversion};
163 } else {
164 $excuse_string = "any distro";
165 my $last_addition;
166 my %seen;
167 for my $report (@$arr) {
168 unless ($seen{$report->{distversion}}++) {
169 $last_addition = $report->{distversion};
172 $arr = [grep {$_->{distversion} eq $last_addition} @$arr];
173 ($selected_release_distrov) = $last_addition;
175 unless ($selected_release_distrov) {
176 warn "Warning: could not find $excuse_string in '$ctarget'";
177 return;
179 print "SELECTED: $selected_release_distrov\n";
180 my($ok,$id);
181 my @all;
182 for my $test (@$arr) {
183 $ok = $test->{action};
184 $id = $test->{id};
185 push @all, {id=>$id,ok=>$ok};
186 return if $Signal;
188 return \@all;
191 sub _parse_single_report {
192 my($report, $dumpvars, %Opt) = @_;
193 my($id) = $report->{id};
194 my($ok) = $report->{ok};
195 my $nnt_dir = "$Opt{cachedir}/nntp-testers";
196 mkpath $nnt_dir;
197 my $target = "$nnt_dir/$id";
198 unless (-e $target) {
199 print "Fetching $target..." if $Opt{verbose};
200 my $resp = _ua->mirror("http://www.nntp.perl.org/group/perl.cpan.testers/$id",$target);
201 if ($resp->is_success) {
202 if ($Opt{verbose}) {
203 my(@stat) = stat $target;
204 my $timestamp = gmtime $stat[9];
205 print "(timestamp $timestamp GMT)\n";
206 if ($Opt{verbose} > 1) {
207 print $resp->headers->as_string;
210 my $headers = "$target.headers";
211 open my $fh, ">", $headers or die;
212 print $fh $resp->headers->as_string;
213 } else {
214 die $resp->status_line;
217 parse_report($target, $dumpvars, %Opt);
220 sub parse_distro {
221 my($distro,%Opt) = @_;
222 my %dumpvars;
223 $Opt{cachedir} ||= "$ENV{HOME}/var/cpantesters";
224 my $cts_dir = "$Opt{cachedir}/cpantesters-show";
225 mkpath $cts_dir;
226 my $ctarget = _download_overview($cts_dir, $distro, %Opt);
227 my $reports;
228 $Opt{ctformat} ||= "html";
229 if ($Opt{ctformat} eq "html") {
230 $reports = _parse_html($ctarget);
231 } else {
232 $reports = _parse_yaml($ctarget);
234 return unless $reports;
235 for my $report (@$reports) {
236 _parse_single_report($report, \%dumpvars, %Opt);
238 if ($Opt{dumpvars}) {
239 print YAML::Syck::Dump(\%dumpvars);
243 =head2 parse_report($target,$dumpvars,%Opt)
245 Reads one report. $target is the local filename to read. $dumpvars is
246 a hashref which gets filled. %Opt are the options as described in the
247 C<ctgetreports> manpage.
249 =cut
250 sub parse_report {
251 my($target,$dumpvars,%Opt) = @_;
252 our @q;
253 my $id = basename($target);
254 my $ok;
255 open my $fh, $target or die;
256 my(%extract);
257 my $report_writer;
258 my $moduleunpack = {};
259 my $expect_prereq = 0;
260 my $expect_toolchain = 0;
261 my $expecting_toolchain_soon = 0;
263 my $in_summary = 0;
264 my $in_prg_output = 0;
266 my $current_headline;
267 my @previous_line = ""; # so we can neutralize line breaks
268 LINE: while (<$fh>) {
269 next unless /<title>(\S+)/;
270 $ok = $1;
271 last;
273 seek $fh, 0, 0;
274 LINE: while (<$fh>) {
275 chomp; # reliable line endings?
276 s/&quot;//; # HTML !!!
277 if (/^--------/ && $previous_line[-2] && $previous_line[-2] =~ /^--------/) {
278 $current_headline = $previous_line[-1];
279 if ($current_headline =~ /PROGRAM OUTPUT/) {
280 $in_prg_output = 1;
281 } else {
282 $in_prg_output = 0;
285 unless ($extract{"meta:perl"}) {
286 my $p5;
287 if (0) {
288 } elsif (/Summary of my perl5 \((.+)\) configuration:/) {
289 $p5 = $1;
290 $in_summary = 1;
292 if ($p5) {
293 my($r,$v,$s,$p);
294 if (($r,$v,$s,$p) = $p5 =~ /revision (\S+) version (\S+) subversion (\S+) patch (\S+)/) {
295 $r =~ s/\.0//; # 5.0 6 2!
296 $extract{"meta:perl"} = "$r.$v.$s\@$p";
297 } elsif (($r,$v,$s) = $p5 =~ /revision (\S+) version (\S+) subversion (\S+)/) {
298 $r =~ s/\.0//;
299 $extract{"meta:perl"} = "$r.$v.$s";
300 } elsif (($r,$v,$s) = $p5 =~ /(\d+\S*) patchlevel (\S+) subversion (\S+)/) {
301 $r =~ s/\.0//;
302 $extract{"meta:perl"} = "$r.$v.$s";
303 } else {
304 $extract{"meta:perl"} = $p5;
308 unless ($extract{"meta:from"}) {
309 if (0) {
310 } elsif (m|<div class="h_name">From:</div> <b>(.+?)</b><br/>|) {
311 $extract{"meta:from"} = $1;
313 $extract{"meta:from"} =~ s/\.$// if $extract{"meta:from"};
315 unless ($extract{"meta:date"}) {
316 if (0) {
317 } elsif (m|<div class="h_name">Date:</div> (.+?)<br/>|) {
318 my $date = $1;
319 my $p = DateTime::Format::Strptime->new(
320 locale => "en",
321 time_zone => "UTC",
322 # April 13, 2005 23:50
323 pattern => "%b %d, %Y %R",
325 my $dt = $p->parse_datetime($date);
326 $extract{"meta:date"} = $dt->datetime;
328 $extract{"meta:date"} =~ s/\.$// if $extract{"meta:date"};
330 unless ($extract{"meta:writer"}) {
331 for ("$previous_line[-1] $_") {
332 if (0) {
333 } elsif (/created (?:automatically )?by (\S+)/) {
334 $extract{"meta:writer"} = $1;
335 } elsif (/CPANPLUS, version (\S+)/) {
336 $extract{"meta:writer"} = "CPANPLUS $1";
337 } elsif (/This report was machine-generated by CPAN::YACSmoke (\S+)/) {
338 $extract{"meta:writer"} = "CPAN::YACSmoke $1";
340 $extract{"meta:writer"} =~ s/[\.,]$// if $extract{"meta:writer"};
343 if ($in_summary) {
344 # we do that first three lines a bit too often
345 my $qr = $Opt{dumpvars} || "";
346 $qr = qr/$qr/ if $qr;
347 unless (@q) {
348 @q = @{$Opt{q}||[]};
349 @q = qw(meta:perl conf:archname conf:usethreads conf:optimize meta:writer meta:from) unless @q;
352 my %conf_vars = map {($_ => 1)} grep { /^conf:/ } @q;
354 if (/^\s*$/ || m|</pre>|) {
355 $in_summary = 0;
356 } else {
357 s/&quot;/"/g;
358 my(%kv) = /\G,?\s*([^=]+)=('[^']+?'|\S+)/gc;
359 while (my($k,$v) = each %kv) {
360 my $ck = "conf:$k";
361 $v =~ s/,$//;
362 if ($v =~ /^'(.*)'$/) {
363 $v = $1;
365 $v =~ s/^\s+//;
366 $v =~ s/\s+$//;
367 # $DB::single = $ck eq "conf:archname";
368 if ($qr && $ck =~ $qr) {
369 $dumpvars->{$ck}{$v}{$ok}++;
371 if ($conf_vars{$ck}) {
372 $extract{$ck} = $v;
377 if ($in_prg_output) {
378 unless ($extract{"meta:output_from"}) {
379 if (/Output from (.+):$/) {
380 $extract{"meta:output_from"} = $1
384 if ($expect_prereq || $expect_toolchain) {
385 if (exists $moduleunpack->{type}) {
386 my($module,$v);
387 if ($moduleunpack->{type} == 1) {
388 (my $leader,$module,undef,$v) = eval { unpack $moduleunpack->{tpl}, $_; };
389 next LINE if $@;
390 if ($leader =~ /^-/) {
391 $moduleunpack = {};
392 $expect_prereq = 0;
393 next LINE;
394 } elsif ($leader =~ /^(
395 buil # build_requires:
396 )/x) {
397 next LINE;
398 } elsif ($module =~ /^(
399 - # line drawing
400 )/x) {
401 next LINE;
403 } elsif ($moduleunpack->{type} == 2) {
404 (my $leader,$module,$v) = eval { unpack $moduleunpack->{tpl}, $_; };
405 next LINE if $@;
406 if ($leader =~ /^\*/) {
407 $moduleunpack = {};
408 $expect_prereq = 0;
409 next LINE;
411 } elsif ($moduleunpack->{type} == 3) {
412 (my $leader,$module,$v) = eval { unpack $moduleunpack->{tpl}, $_; };
413 next LINE if $@;
414 if (!$module) {
415 $moduleunpack = {};
416 $expect_toolchain = 0;
417 next LINE;
418 } elsif ($module =~ /^-/) {
419 next LINE;
422 $module =~ s/\s+$//;
423 if ($module) {
424 $v =~ s/^\s+//;
425 $v =~ s/\s+$//;
426 $extract{"mod:$module"} = $v;
429 if (/(\s+)(Module\s+)(Need\s+)Have/) {
430 $moduleunpack = {
431 tpl => 'a'.length($1).'a'.length($2).'a'.length($3).'a*',
432 type => 1,
434 } elsif (/(\s+)(Module Name\s+)(Have)(\s+)Want/) {
435 my $adjust_1 = 0;
436 my $adjust_2 = -length($4);
437 my $adjust_3 = length($4);
438 # two pass would be required to see where the
439 # columns really are. Or could we get away with split?
440 $moduleunpack = {
441 tpl => 'a'.length($1).'a'.(length($2)+$adjust_2).'a'.(length($3)+$adjust_3),
442 type => 2,
446 if (/PREREQUISITES|Prerequisite modules loaded/) {
447 $expect_prereq=1;
449 if ($expecting_toolchain_soon) {
450 if (/(\s+)(Module\s+) Have/) {
451 $expect_toolchain=1;
452 $expecting_toolchain_soon=0;
453 $moduleunpack = {
454 tpl => 'a'.length($1).'a'.length($2).'a*',
455 type => 3,
459 if (/toolchain versions installed/) {
460 $expecting_toolchain_soon=1;
462 push @previous_line, $_;
463 } # LINE
464 my $diag = "";
465 if (my $qr = $Opt{dumpvars}) {
466 $qr = qr/$qr/;
467 while (my($k,$v) = each %extract) {
468 if ($k =~ $qr) {
469 $dumpvars->{$k}{$v}{$ok}++;
473 for my $want (@q) {
474 my $have = $extract{$want} || "";
475 $diag .= " $want\[$have]";
477 printf " %-4s %8d%s\n", $ok, $id, $diag;
478 if ($Opt{interactive}) {
479 require IO::Prompt;
480 local @ARGV;
481 local $ARGV;
482 my $ans = IO::Prompt::prompt
484 -p => "View $id? ",
485 -d => "y",
486 -u => qr/[ynq]/,
487 -onechar,
489 print "\n";
490 if ($ans eq "y") {
491 open my $ifh, "<", $target or die "Could not open $target: $!";
492 $Opt{pager} ||= "less";
493 open my $lfh, "|-", $Opt{pager} or die "Could not fork '$Opt{pager}': $!";
494 local $/;
495 print {$lfh} <$ifh>;
496 close $ifh or die "Could not close $target: $!";
497 close $lfh or die "Could not close pager: $!"
498 } elsif ($ans eq "q") {
499 $Signal++;
500 return;
505 =head1 AUTHOR
507 Andreas Koenig
509 =head1 BUGS
511 Please report any bugs or feature requests through the web
512 interface at
513 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=CPAN-Testers-ParseReport>.
514 I will be notified, and then you'll automatically be notified of
515 progress on your bug as I make changes.
517 =head1 SUPPORT
519 You can find documentation for this module with the perldoc command.
521 perldoc CPAN::Testers::ParseReport
524 You can also look for information at:
526 =over 4
528 =item * RT: CPAN's request tracker
530 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=CPAN-Testers-ParseReport>
532 =item * AnnoCPAN: Annotated CPAN documentation
534 L<http://annocpan.org/dist/CPAN-Testers-ParseReport>
536 =item * CPAN Ratings
538 L<http://cpanratings.perl.org/d/CPAN-Testers-ParseReport>
540 =item * Search CPAN
542 L<http://search.cpan.org/dist/CPAN-Testers-ParseReport>
544 =back
547 =head1 ACKNOWLEDGEMENTS
549 Thanks to RJBS for module-starter.
551 =head1 COPYRIGHT & LICENSE
553 Copyright 2008 Andreas Koenig, all rights reserved.
555 This program is free software; you can redistribute it and/or modify it
556 under the same terms as Perl itself.
559 =cut
561 1; # End of CPAN::Testers::ParseReport