  refill-cpanstatsdb-minutes.pl directory
my @opt = <<'=back' =~ /B<--(\S+)>/g;
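# The here-doc above slurps everything up to the '=back' line, i.e. the POD
# that follows, and extracts every B<--...> option spec from it, so the list
# of accepted options is derived directly from the documentation below.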
=item B<--collectguids!>
Try to identify the GUIDs on the amazon side that are missing on the
cpantesters side. Report them together with the record in
tab-separated format. (Misnomer: originally we reported only the GUID,
but it turned out the GUID alone was not usable.)
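
Each reported GUID is printed on a line of its own, followed by the
tab-separated record (state, platform, dist, version, perl), judging from the
printf further down in the code. An illustrative (made-up) line, with TABs
shown as spaces:

  1(1): 8d4da104-104a-11e2-bdcc-373e3b6b8117  pass  i86pc-solaris-thread-multi-64int  Test-Spec-RMock  0.003  5.14.1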
Limits the investigation to a certain date. The format must be
left-aligned (the value is matched as a prefix of the minute), e.g.
C<20121008>. Note that results are unreliable for dates earlier than
20121008 and that at least 15 hours should have passed since the end
of the picked day, because the analysis fetches fresh data from
upstream only every 15 hours and then needs about 6 hours for
processing. So 21 hours is better, and leave room for unanticipated
delays.
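
For example, with C<--pickdate=20130422> the picked day ends at
2013-04-22T24:00Z; adding the 21 hours (15 h fetch interval plus roughly
6 h processing) puts the earliest sensible run at about 2013-04-23T21:00Z,
plus some slack.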
Defaults to 2_800_000_000. The logfile is huge and skipping 2.8 GB
helps run times enormously. On the relation of offsets to dates:
   2700000000 2013-12-16T21:46:30Z
   2800000000 2014-01-02T06:34:34Z
   2900000000 2014-01-17T11:43:43Z
   3000000000 2014-01-30T15:29:56Z
   3100000000 2014-02-14T16:14:52Z
  11099702509 2017-04-17T11:26:33Z
IOW: 100_000_000 ≙ (U+2259 ≙ estimates) 2 weeks, roughly unchanged over
the years.
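
A rough way to choose a starting offset for some other date is to interpolate
between two neighbouring rows of the table above. The helper below is only an
illustration (C<estimate_seek> is not part of this script, and the log does
not grow at a perfectly constant rate); when in doubt, seek a little earlier
than the estimate, because the scan only moves forward from the seek position.

  use Time::Local qw(timegm);
  # linear interpolation between two known (byte offset, epoch) pairs
  sub estimate_seek {
      my ($target_epoch) = @_;
      my @lo = (2_800_000_000, timegm(34, 34, 6, 2, 0, 2014));   # 2014-01-02T06:34:34Z
      my @hi = (2_900_000_000, timegm(43, 43, 11, 17, 0, 2014)); # 2014-01-17T11:43:43Z
      my $frac = ($target_epoch - $lo[1]) / ($hi[1] - $lo[1]);
      return int($lo[0] + $frac * ($hi[0] - $lo[0]));
  }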
Opens the directory, looks into all JSON files that describe recent
cpanstats records, collects all minutes, and counts the number of reports
in amazon and in cpanstatsdb. With the --collectguids argument it also
lists GUIDs on the amazon side that are either definitively missing or
are victims of some confusion. Confusion can arise because the amazon
data comes from log.txt, which does not provide osversion. Or because
testers send duplicates. Or because somebody calculates the time
wrongly. We're not sure. Some of the GUIDs we report are most probably
not actually missing. On initial inspection this seems to be in the 1
to 15 percent range.
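
The per-minute output has four columns: the minute, the count seen in the
metabase log, the count seen in the cpanstatsdb delta, and their difference.
An illustrative (made-up) line:

  201304220623    17    16     1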
=head1 HISTORICAL EVIDENCE
2013-04-29 21:25 I called this program with

  andreas@ds8143:~/src/andk-cpan-tools% /home/andreas/src/installed-perls/v5.16.0/4e6d/bin/perl bin/refill-cpanstatsdb-minutes.pl --pickdate=20130422 ~/var/refill-cpanstatsdb/2013/04 | perl -nale 'printf "%d %d\n", $F[2], $S+=$F[2] if $F[0] ge "201304220623" and $F[0] le "201304221306"'

to count the number of reports in a specific range that I had spotted
on a first run piping to "less".

Revisiting my history I find

  2012-10-17 07:17 /home/andreas/src/installed-perls/v5.16.0/4e6d/bin/perl -d /tmp/refill-cpanstatsdb-minutes.pl --pickdate=201210082202 --collectguids ~/var/refill-cpanstatsdb/2012/10

Interesting to see a longer argument to pickdate.
use FindBin;
use lib "$FindBin::Bin/../lib";

use File::Basename qw(dirname);
use File::Path qw(mkpath);
use Pod::Usage qw(pod2usage);
use Hash::Util qw(lock_keys);
use CPAN::DistnameInfo;
use JSON::XS;
lock_keys %Opt, map { /([^=!]+)/ } @opt;
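# lock_keys() restricts %Opt to exactly the option names collected above (the
# map strips the "=..."/"!" suffixes), so a typo like $Opt{pickdat} dies loudly.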
if ( $Opt{pickdate} ) {
    die "Invalid pickdate[$Opt{pickdate}]" unless $Opt{pickdate} =~ /^[1-9][0-9]+$/;
}
$Opt{seek} //= 2_800_000_000;
use List::Util qw(minstr maxstr);

our $jsonxs = JSON::XS->new->indent(0);

my $dir = shift or pod2usage(1);
opendir my $dh, $dir or die "Could not opendir '$dir': $!";
my @jsonfiles = sort grep /\.json\.gz$/, readdir $dh;

my $delta_records = 0;
for my $dirent (@jsonfiles) {
    my $abs = "$dir/$dirent";
    next unless -M $abs > 1/12; # must be older than two hours to prevent reading unfinished files
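    # (-M returns the file age in days, and 1/12 of a day is two hours)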
    open my $fh, "-|", "zcat" => $abs or die "Could not fork: $!";
    # {"version":"0.003","dist":"Test-Spec-RMock","osvers":"2.11","state":"pass","perl":"5.14.1","fulldate":"201210070645","osname":"solaris","postdate":"201210","platform":"i86pc-solaris-thread-multi-64int","guid":"8d4da104-104a-11e2-bdcc-373e3b6b8117","id":"23770157","type":"2","tester":"root@klanker.bingosnet.co.uk ((Charlie &))"}
    while (<$fh>) {
        my $report = $jsonxs->decode($_);
        my $minute = $report->{fulldate};
        if (!$Opt{pickdate} || $Opt{pickdate} && $minute =~ /^$Opt{pickdate}/) {
            $S{$minute}{cpandbdelta}++;
            if ( $Opt{collectguids} ) {
                my($canon) = join "\t", @{$report}{qw(state platform dist version perl)};
                $S{$minute}{cpandbdeltarecord}{$canon}++;
            }
        }
    }
    close $fh or die "Could not close 'zcat ...': $!";
}
my $found_records = keys %S;
if ( $found_records > 0 ) {
    my $mindate = minstr keys %S;
    my $maxdate = maxstr keys %S;
    my $mblog = "$ENV{HOME}/var/metabase-log/metabase.log";
    open my $fh, $mblog or die "Could not open '$mblog': $!";

    seek $fh, $Opt{seek}, 0;
    <$fh>; # throw away this one
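    # a byte-offset seek usually lands in the middle of a line, so the first
    # (partial) line is discarded before the real scan starts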
    #[2012-10-07T06:45:15Z] [Chris Williams (BINGOS)] [pass] [KJELLM/Test-Spec-RMock-0.003.tar.gz] [i86pc-solaris-thread-multi-64int] [perl-v5.14.1] [8d4e0766-104a-11e2-bdcc-373e3b6b8117] [2012-10-07T06:45:15Z]
    #[2012-10-04T07:14:06Z] [Chris Williams (BINGOS)] [pass] [TOBYINK/P5U-Command-Peek-0.001.tar.gz] [x86_64-linux-thread-multi-ld] [perl-v5.12.4] [15dfc45c-0df3-11e2-bdcc-373e3b6b8117] [2012-10-04T07:14:06Z]
    while (<$fh>) {
        next if /^\.\.\./; # we know that we missed something upstream
        my($date, $author, $state, $distro, $platform, $perl, $guid) = m{
            ...              # multi-line /x pattern body (elided in this excerpt)
        }x
            or die "non-matching line: '$_'\n";
        next if $mbseen{$guid}++;
        $date =~ s/:\d\dZ$//;  # cut seconds off
        $date =~ s/[^0-9]//g;  # remove [-T:]
        next unless $date ge $mindate && $date le $maxdate;
        $S{$date}{mblog}++;
        if ( $Opt{collectguids} ) {
            my $d = CPAN::DistnameInfo->new($distro);
            my($shortperl) = $perl =~ /perl-v(.+)/;
            my $canon = join "\t", $state, $platform, $d->dist, $d->version, $shortperl;
            $S{$date}{mblogrecord}{$canon}{$guid}++;
        }
    }
    my @fields = qw(mblog cpandbdelta);
    my %SUM = map { ($_ => 0) } @fields;
    for my $m (sort { $a <=> $b } keys %S) {
        $S{$m}{mblog}       ||= 0;
        $S{$m}{cpandbdelta} ||= 0;
        printf "%s %5d %5d %5d\n", $m, $S{$m}{mblog}, $S{$m}{cpandbdelta}, $S{$m}{mblog}-$S{$m}{cpandbdelta};
        my $miss = $S{$m}{mblog}-$S{$m}{cpandbdelta};
        if ( $Opt{collectguids} ) {
            for my $k (keys %{$S{$m}{mblogrecord}}) {
                # $k is a canonized record; mblog stands for metabase log
                my $v = $S{$m}{mblogrecord}{$k};
                # $v is a hashref, the keys are guids we might want to publish
                my $cnt_mb = keys %$v;
                my $cnt_ct = $S{$m}{cpandbdeltarecord}{$k} || 0;
                if ( $cnt_mb > $cnt_ct ) {
                    for my $k2 (keys %$v) {
                        printf " %d(%d): %s\t%s\n", ++$i, ++$j, $k2, $k;
                    }
                }
            }
        }
        for my $k (@fields) {
            $SUM{$k} += $S{$m}{$k};
        }
    }
    printf ".........SUM %5d %5d %5d\n", $SUM{mblog}, $SUM{cpandbdelta}, $SUM{mblog}-$SUM{cpandbdelta};
222 "None of the %d records among the jsonfiles (\n%s) was matching pickdate '%s'",
224 join("",map(" $_\n",@jsonfiles)),
# cperl-indent-level: 4