# Harvest the option specs from the documentation itself: the here-doc
# swallows the text up to the terminating "=back" line, and every
# B<--...> item found in it contributes one entry to @opt.
21 my @opt = <<'=back' =~ /B<--(\S+)>/g;
25 number of days to scan. Defaults to 1461.
33 Maximum number of hits to produce. Once reached, the program stops.
35 =item B<--redis-enqueue!>
37 If set, we throw each hit into the redis on localhost. Before doing so
38 we make a sanity check: analysis:jobqueue:q must exist and be a zset,
45 Find all rows on the front page that have no link in the first column
46 or have an 8-digit upload date or whose comment column is empty even
47 though there is an annotation.
51 =head2 no link in first column
53 First run was on 2013-11-05 and revealed there were 600 rows without a
54 link in the first column. Nearly all of them are in the rows higher
57 The reason seems to be that some version N+1 had no fail at all and so
58 it remained undiscovered that N was suddenly outdated. This is still
59 unresearched. One twist seems to be that the sample of 500 is often
60 too low. If we have only three fails and then sample to less than N,
61 we probably lose one of the three fails. We should check whether
63 NUMBER_OF_FAILS * SAMPLE_SIZE / POPULATION is > SOMEVALUE
65 where SOMEVALUE has to be > 3 to avoid random undersampling. This has
66 been fixed in ctgetreports by introducing the option C<--minpass>.
68 Also found C<AURUM/Text-Tradition-Analysis-1.1-podfix> and
69 C<DYLAN/POE-Component-Runner-0.04.b>. I didn't bother to investigate
75 Win32-FindFile-0.14-withoutworldwriteables|2
77 where we just reran a calculation with --pick and it repaired itself.
78 It seems no biggie, so skip this for now.
80 And then there is PerlIO-text-0.007 which probably was just waiting to
81 be calculated. It had only 6 fails in 18 months, so maybe was
82 discovered late and never reached by the priority queue. But maybe it
85 Here is an interesting one:
87 sqlite> select distv, greenish from distcontext where distv like 'Config-Model-2%';
96 Config-Model-2.026_1|2
99 Config-Model-2.030_01|2
105 This illustrates best that one step on the way to achieve sanity can
106 be to delete old cruft from distcontext.
108 =head2 8-digit upload date
110 Done around the caching table distlookup Jan/Feb 2014.
112 =head2 annotation not yet integrated
114 20140310: first stab at it
120 use lib
"$FindBin::Bin/../lib";
126 use File
::Basename
qw(dirname);
127 use File
::Path
qw(mkpath);
132 use Hash
::Util
qw(lock_keys);
# Restrict %Opt to the option names collected in @opt. Each entry is cut
# at the first "=", "|" or "!" so that only the bare option name remains
# (e.g. "redis-enqueue!" becomes "redis-enqueue"); with the keys locked,
# a misspelled $Opt{...} key is caught instead of silently created.
135 lock_keys
%Opt, map { /([^=|!]+)/ } @opt;
# With --redis-enqueue, check up front that the key analysis:jobqueue:q
# holds a sorted set (zset) and abort with a message naming the actual
# type otherwise.
145 if ($Opt{"redis-enqueue"}) {
148 my($type) = $redis->type("analysis:jobqueue:q");
149 die "localhost redis analysis:jobqueue:q is a '$type', not a zset" unless "zset" eq $type;
# Read annotate.txt, located one directory above this script, into $anno.
153 my $annofile = "$FindBin::Bin/../annotate.txt";
# NOTE(review): two-argument open; a three-argument open with an explicit
# '<' mode would be the safer idiom.
155 unless (open $fh, $annofile) {
163 ANNOLINE
: while (<$fh>) {
# Lines that are empty or contain only whitespace carry no annotation.
165 next ANNOLINE
if /^\s*$/;
# The first whitespace-separated word is the key (distv); the rest of the
# line is stored as its annotation text.
166 my($distv,$splain) = split " ", $_, 2;
167 $anno->{$distv} = $splain;
# Request the page from the analysis web server, passing --days as the
# "age" query parameter, and parse the response as XML on success.
173 my $ua = LWP
::UserAgent
->new();
174 # my $resp = $ua->get("http://217.199.168.174:3000/?author=&age=2922&SUBMIT_xxx=Submit");
175 my $resp = $ua->get("http://217.199.168.174:3000/?author=&age=$Opt{days}&SUBMIT_xxx=Submit");
176 # my $resp = $ua->get("http://217.199.168.174:3000/?author=&age=91.3&SUBMIT_xxx=Submit");
180 if ($resp->is_success) {
181 my $content = $resp->decoded_content;
182 my $p = XML
::LibXML
->new();
183 # loading as html complains about </p>
184 # loading as xml complained about &nbsp; not being defined, but we fixed this by changing the document
185 my $doc = $p->load_xml(string
=> $content);
186 my $root = $doc->documentElement;
# Declare the prefix "html" for the XHTML namespace on the root element;
# the XPath expressions used on this document are written with html:...
187 $root->setNamespace("http://www.w3.org/1999/xhtml","html",1);
# Visit every <tr> below the table of class "texttable" and decide, row
# by row, whether it is a repair candidate.
189 my @row = $root->findnodes("//html:table[\@class='texttable']//html:tr");
190 ROW
: for my $row (@row) {
# Rows that contain no <td> cells are skipped.
191 my(@td) = $row->findnodes("html:td") or next;
# Look for a link inside the first cell.
193 my($a) = $td1->findnodes("html:a");
194 my $repaircandidate = 0;
197 my $href = $a->getAttribute("href");
# Keep only what follows "distv=" in the link target.
198 $href =~ s/.*?distv=//;
201 # printf "[%s]", $href;
203 $repaircandidate = 1;
206 my $td5string = $td5->textContent;
# A run of eight digits in this cell marks the row for repair.
207 if ($td5string =~ /[0-9]{8}/) {
208 $repaircandidate = 1;
# An annotation is known for this distv: the row needs repair when the
# corresponding cell is blank.
210 if ($distv && $anno->{$distv}) {
212 my $td7string = $td7->textContent;
213 if ($td7string =~ /^\s*$/) {
214 $repaircandidate = 1;
# Print one report line per repair candidate and, with --redis-enqueue,
# add the candidate to the job queue.
217 if ($repaircandidate) {
# --max, when set, bounds the number of hits.
219 if ($Opt{max
} && $cnt > $Opt{max
}) {
# Reduce the serialized first cell to the text found inside its HTML
# comment, remove all whitespace and drop everything up to and including
# the last slash.
222 my $td1string = $td1->serialize;
223 $td1string =~ s/.+<!-- //gs;
224 $td1string =~ s/ -->.+//gs;
225 $td1string =~ s/\s//g;
226 $td1string =~ s
|(.+)/||;
# Cells 0, 3 and 4 are reduced to their digits for the numeric columns of
# the report line.
229 my(@td034) = @td[0,3,4];
230 for my $tdi (0..$#td034) {
231 my $td = $td034[$tdi];
232 my $string = $td->textContent;
233 $string =~ s/[^0-9]//g;
234 $td034[$tdi] = $string
236 printf "%3d %4d %4d %4d %-9s http://matrix.cpantesters.org/?dist=%s\n", $cnt, @td034, $author, $td1string;
237 if ($Opt{"redis-enqueue"}) {
# Ask redis for the highest score in the queue only while $highscore is
# still undefined.
238 unless (defined $highscore) {
239 (undef,$highscore) = $redis->zrevrange("analysis:jobqueue:q",0,0,"withscores");
242 $redis->zadd("analysis:jobqueue:q",$highscore,$td1string);
# Emit the HTTP response code as a warning.
247 warn sprintf "Code: %s\n", $resp->code;
253 # cperl-indent-level: 4