Routinely eliminating annotations about probably outdated reports
[andk-cpan-tools.git] / bin / parse-analysis-index.pl
blob6b8925109b478864cf74f423edaf07a3d624fd91
1 #!/usr/bin/perl
3 # use 5.010;
4 use strict;
5 use warnings;
7 =head1 NAME
11 =head1 SYNOPSIS
15 =head1 OPTIONS
17 =over 8
19 =cut
21 my @opt = <<'=back' =~ /B<--(\S+)>/g;
23 =item B<--days=f>
25 number of days to scan. Defaults to 1461.
27 =item B<--help|h!>
29 This help
31 =item B<--max=i>
33 Maximum number of hits to produce. Once reached, the program stops.
35 =item B<--redis-enqueue!>
37 If set, we throw each hit into the redis on localhost. Before doing so
38 we make a sanity check: analysis:jobqueue:q must exist and be a zset,
39 otherwise we die.
41 =back
43 =head1 DESCRIPTION
45 Find all rows on the front page that have no link in the first column
46 or have an 8-digit upload date or the comment column is empty despite
47 there is an annotation.
49 =head1 HISTORY
51 =head2 no link in first column
53 First run was on 2013-11-05 and revealed there were 600 rows without a
54 link in the first column. Nearly all of them are in the rows higher
55 than 2000.
57 Reason seems to be that some version N+1 had no fail at all and so it
58 remained undiscovered that N was suddenly outdated. Still
59 unresearched. One twist seems to be that the sample of 500 is often
60 too low. If we have only three fails and then sample to less than N,
61 we probably lose one of the three fails. We should check whether
63 NUMBER_OF_FAILS * SAMPLE_SIZE / POPULATION is > SOMEVALUE
65 where SOMEVALUE has to be > 3 to avoid random undersampling. This has
66 been fixed in ctgetreports by introducing the option C<--minpass>.
68 Also found C<AURUM/Text-Tradition-Analysis-1.1-podfix> and
69 C<DYLAN/POE-Component-Runner-0.04.b>. I didn't bother to investigate
70 here for now.
72 And also
74 Win32-FindFile-0.14|3
75 Win32-FindFile-0.14-withoutworldwriteables|2
77 where we just reran a calculation with --pick and it repaired itself.
78 It seems no biggie, so skip this for now.
80 And then there is PerlIO-text-0.007 which probably was just waiting to
81 be calculated. It had only 6 fails in 18 months, so maybe was
82 discovered late and never reached by the priority queue. But maybe it
83 was something else.
85 Here is an interesting one:
87 sqlite> select distv, greenish from distcontext where distv like 'Config-Model-2%';
88 Config-Model-2.002|2
89 Config-Model-2.005|2
90 Config-Model-2.013|3
91 Config-Model-2.014|2
92 Config-Model-2.017|2
93 Config-Model-2.015|1
94 Config-Model-2.024|1
95 Config-Model-2.025|2
96 Config-Model-2.026_1|2
97 Config-Model-2.010|
98 Config-Model-2.029|2
99 Config-Model-2.030_01|2
100 Config-Model-2.037|2
101 Config-Model-2.038|2
102 Config-Model-2.039|1
103 Config-Model-2.042|1
105 This illustrates best that one step on the way to achieve sanity can
106 be to delete old cruft from distcontext.
108 =head2 8-digit upload date
110 Done around the caching table distlookup Jan/Feb 2014.
112 =head2 annotation not yet integrated
114 20140310: first stab at it
116 =cut
119 use FindBin;
120 use lib "$FindBin::Bin/../lib";
121 BEGIN {
122 push @INC, qw( );
125 use Dumpvalue;
126 use File::Basename qw(dirname);
127 use File::Path qw(mkpath);
128 use File::Spec;
129 use File::Temp;
130 use Getopt::Long;
131 use Pod::Usage;
132 use Hash::Util qw(lock_keys);
134 our %Opt;
135 lock_keys %Opt, map { /([^=|!]+)/ } @opt;
136 GetOptions(\%Opt,
137 @opt,
138 ) or pod2usage(1);
139 if ($Opt{help}) {
140 pod2usage(0);
143 $Opt{days} ||= 1461;
144 my $redis;
145 if ($Opt{"redis-enqueue"}) {
146 require Redis;
147 $redis = Redis->new;
148 my($type) = $redis->type("analysis:jobqueue:q");
149 die "localhost redis analysis:jobqueue:q is a '$type', not a zset" unless "zset" eq $type;
151 my $anno;
153 my $annofile = "$FindBin::Bin/../annotate.txt";
154 my $fh;
155 unless (open $fh, $annofile) {
156 # $DB::single=1;
157 die "Could not";
160 $anno = {};
161 local $/ = "\n";
162 my $i = 0;
163 ANNOLINE: while (<$fh>) {
164 chomp;
165 next ANNOLINE if /^\s*$/;
166 my($distv,$splain) = split " ", $_, 2;
167 $anno->{$distv} = $splain;
171 use LWP::UserAgent;
172 use XML::LibXML;
173 my $ua = LWP::UserAgent->new();
174 # my $resp = $ua->get("http://217.199.168.174:3000/?author=&age=2922&SUBMIT_xxx=Submit");
175 my $resp = $ua->get("http://217.199.168.174:3000/?author=&age=$Opt{days}&SUBMIT_xxx=Submit");
176 # my $resp = $ua->get("http://217.199.168.174:3000/?author=&age=91.3&SUBMIT_xxx=Submit");
177 $|=1;
178 my $cnt = 0;
179 my $highscore;
180 if ($resp->is_success) {
181 my $content = $resp->decoded_content;
182 my $p = XML::LibXML->new();
183 # loading as html complains about </p>
184 # loading as xml complained about &nbsp not being defined, but we fixed this by changing the document
185 my $doc = $p->load_xml(string => $content);
186 my $root = $doc->documentElement;
187 $root->setNamespace("http://www.w3.org/1999/xhtml","html",1);
188 # $DB::single=1;
189 my @row = $root->findnodes("//html:table[\@class='texttable']//html:tr");
190 ROW: for my $row (@row) {
191 my(@td) = $row->findnodes("html:td") or next;
192 my $td1 = $td[1];
193 my($a) = $td1->findnodes("html:a");
194 my $repaircandidate = 0;
195 my $distv;
196 if ($a) {
197 my $href = $a->getAttribute("href");
198 $href =~ s/.*?distv=//;
199 $distv = $href;
200 $distv =~ s{.+/}{};
201 # printf "[%s]", $href;
202 } else {
203 $repaircandidate = 1;
205 my $td5 = $td[5];
206 my $td5string = $td5->textContent;
207 if ($td5string =~ /[0-9]{8}/) {
208 $repaircandidate = 1;
210 if ($distv && $anno->{$distv}) {
211 my $td7 = $td[7];
212 my $td7string = $td7->textContent;
213 if ($td7string =~ /^\s*$/) {
214 $repaircandidate = 1;
217 if ($repaircandidate) {
218 $cnt++;
219 if ($Opt{max} && $cnt > $Opt{max}) {
220 last ROW;
222 my $td1string = $td1->serialize;
223 $td1string =~ s/.+<!-- //gs;
224 $td1string =~ s/ -->.+//gs;
225 $td1string =~ s/\s//g;
226 $td1string =~ s|(.+)/||;
227 my $author = $1;
228 $author =~ s|/.+||;
229 my(@td034) = @td[0,3,4];
230 for my $tdi (0..$#td034) {
231 my $td = $td034[$tdi];
232 my $string = $td->textContent;
233 $string =~ s/[^0-9]//g;
234 $td034[$tdi] = $string
236 printf "%3d %4d %4d %4d %-9s http://matrix.cpantesters.org/?dist=%s\n", $cnt, @td034, $author, $td1string;
237 if ($Opt{"redis-enqueue"}) {
238 unless (defined $highscore) {
239 (undef,$highscore) = $redis->zrevrange("analysis:jobqueue:q",0,0,"withscores");
240 $highscore ||= 1;
242 $redis->zadd("analysis:jobqueue:q",$highscore,$td1string);
246 } else {
247 warn sprintf "Code: %s\n", $resp->code;
248 die $resp->content;
251 # Local Variables:
252 # mode: cperl
253 # cperl-indent-level: 4
254 # End: