new perls v5.39.10
[andk-cpan-tools.git] / bin / refill-cpanstatsdb-minutes.pl
blob8e87a3531120dad54a747e4538b3a18bfb0c0dd3
1 #!/usr/bin/perl
3 # use 5.010;
4 use strict;
5 use warnings;
7 =encoding utf8
9 =head1 NAME
13 =head1 SYNOPSIS
15 refill-cpanstatsdb-minutes.pl directory
17 =head1 OPTIONS
19 =over 8
21 =cut
23 my @opt = <<'=back' =~ /B<--(\S+)>/g;
25 =item B<--help|h!>
27 This help
29 =item B<--collectguids!>
31 Try to identify the guids on amazon side that are missing on
32 cpantesters side. Report them together with the record in
33 tab-separated format. (Misnomer: originally we reported only the guid
34 but it turned out the guid was not usable)
36 =item B<--pickdate=s>
38 Limits the investigation to a certain date. Format must be
39 left-aligned, e.g. C<20121008>. Note that results are unreliable for
40 dates lower than 20121008 and that at least 15 hours should have
41 passed since the end of the picked day because analysis fetches fresh
42 data from upstream only every 15 hours and then needs about 6 hours
43 for processing. So better is 21 hours, and give room for unanticipated
44 stuff.
46 =item B<--seek=i>
48 Defaults to 2_800_000_000. The logfile is huge and skipping 2.8 G
49 helps run times enormously. On the relation to dates:
51 2700000000 2013-12-16T21:46:30Z
52 2800000000 2014-01-02T06:34:34Z
53 2900000000 2014-01-17T11:43:43Z
54 3000000000 2014-01-30T15:29:56Z
55 3100000000 2014-02-14T16:14:52Z
56 11099702509 2017-04-17T11:26:33Z
58 IOW: 100_000_000 ≙ (U+2259 ≙ estimates) 2 weeks unchanged over the
59 years.
61 =back
63 =head1 DESCRIPTION
65 Opens directory, looks into all json files that describe recent
66 cpanstats records, collects all minutes and counts number of reports
67 in amazon and in cpanstatsdb. With the --collectguids argument it also
68 lists guids on the amazon side that are either definitively missing or
69 are victims of some confusion. Confusions can arise because the source
70 of the amazon data comes from log.txt which does not provide
71 osversion. Or because testers send duplicates. Or because somebody
72 calculates the time wrongly. We're not sure. Some of the GUIDs we
73 report are most probably not missing. On initial inspection this
74 seems to be in the 1 to 15 percent range.
76 =head1 HISTORICAL EVIDENCE
78 2013-04-29 21:25 I called this program with
80 andreas@ds8143:~/src/andk-cpan-tools% /home/andreas/src/installed-perls/v5.16.0/4e6d/bin/perl bin/refill-cpanstatsdb-minutes.pl --pickdate=20130422 ~/var/refill-cpanstatsdb/2013/04 | perl -nale 'printf "%d %d\n", $F[2], $S+=$F[2] if $F[0] ge "201304220623" and $F[0] le "201304221306"'
82 to count the number of reports in a specific range that I had spotted
83 on a first run piping to "less".
85 Revisiting my history I find
87 2012-10-17 07:17 /home/andreas/src/installed-perls/v5.16.0/4e6d/bin/perl -d /tmp/refill-cpanstatsdb-minutes.pl --pickdate=201210082202 --collectguids ~/var/refill-cpanstatsdb/2012/10
89 Interesting to see a longer argument to pickdate.
91 =cut
# ------------------------------------------------------------------
# NOTE(review): this text appears to be a gitweb scrape, not the
# pristine file. The embedded original line numbers skip (98-99, 114,
# 117, 143-146, 148, 158, 183-184, 206-210, 213-214, 218, 221,
# 226-228), i.e. blank lines and several closing braces/semicolons
# were lost in extraction. The code below is NOT runnable as shown;
# recover the original file before making code changes. Comments
# describe the apparent intended structure only.
# ------------------------------------------------------------------
# --- module loading ---
94 use FindBin;
95 use lib "$FindBin::Bin/../lib";
96 BEGIN {
# NOTE(review): the BEGIN block's closing brace sat on a dropped line
# (original 98/99) -- confirm against the pristine source.
97 push @INC, qw( );
100 use Dumpvalue;
101 use File::Basename qw(dirname);
102 use File::Path qw(mkpath);
103 use File::Spec;
104 use File::Temp;
105 use Getopt::Long;
106 use Hash::Util qw(lock_keys);
107 use CPAN::DistnameInfo;
# --- option parsing ---
# %Opt receives the parsed command line. lock_keys restricts the hash
# to exactly the option names harvested from the POD into @opt (the
# map strips the Getopt::Long type suffixes "=s", "!", "=i"), so a
# typo'd key access anywhere below becomes a hard error.
109 our %Opt;
110 lock_keys %Opt, map { /([^=!]+)/ } @opt;
111 GetOptions(\%Opt,
112 @opt,
113 ) or pod2usage(1);
# NOTE(review): pod2usage() is called here and below, but no
# `use Pod::Usage;` is visible -- presumably it was on a dropped
# line; verify, otherwise this dies with "Undefined subroutine".
# --pickdate must be all digits with no leading zero (a left-aligned
# YYYYMMDD... prefix, per the POD).
115 if ( $Opt{pickdate} ) {
116 die "Invalid pickdate[$Opt{pickdate}]" unless $Opt{pickdate} =~ /^[1-9][0-9]+$/
# Default byte offset into the huge metabase.log (see POD table
# relating offsets to dates).
118 $Opt{seek} //= 2_800_000_000;
120 use JSON::XS ();
121 use List::Util qw(minstr maxstr);
122 use DBD::SQLite;
# Shared decoder; indent(0) only matters for encoding, decode is used below.
123 our $jsonxs = JSON::XS->new->indent(0);
# --- phase 1: count cpanstatsdb delta records per minute ---
# The single positional argument is a directory of *.json.gz files,
# one JSON report per line (sample record at original line 134).
125 my $dir = shift or pod2usage(1);
126 opendir my $dh, $dir or die "Could not opendir '$dir': $!";
127 my @jsonfiles = sort grep /\.json\.gz$/, readdir $dh;
# %S is keyed by minute (fulldate, YYYYMMDDHHMM); per minute we track
# cpandbdelta / mblog counters and, with --collectguids, per-record
# breakdowns keyed by a tab-joined canonical tuple.
128 my %S;
129 my $delta_records = 0;
130 for my $dirent (@jsonfiles) {
131 my $abs = "$dir/$dirent";
# -M is in days; 1/12 day = two hours.
132 next unless -M $abs > 1/12; # must be older than two hours to prevent reading unfinished files
# NOTE(review): "zcat" => $abs relies on list-form open; fine, but the
# die message says "fork" for any open failure.
133 open my $fh, "-|", "zcat" => $abs or die "Could not fork: $!";
134 # {"version":"0.003","dist":"Test-Spec-RMock","osvers":"2.11","state":"pass","perl":"5.14.1","fulldate":"201210070645","osname":"solaris","postdate":"201210","platform":"i86pc-solaris-thread-multi-64int","guid":"8d4da104-104a-11e2-bdcc-373e3b6b8117","id":"23770157","type":"2","tester":"root@klanker.bingosnet.co.uk ((Charlie &))"}
135 while (<$fh>) {
136 my $report = $jsonxs->decode($_);
137 my $minute = $report->{fulldate};
138 $delta_records++;
# Keep the record when no --pickdate given, or when the minute starts
# with the picked date prefix.
139 if (!$Opt{pickdate} || $Opt{pickdate} && $minute =~ /^$Opt{pickdate}/) {
140 $S{$minute}{cpandbdelta}++;
141 if ( $Opt{collectguids} ) {
# Canonical identity of a report *without* guid/osversion: the metabase
# log lacks osversion, so this 5-tuple is the join key between sides.
142 my($canon) = join "\t", @{$report}{qw(state platform dist version perl)};
143 $S{$minute}{cpandbdeltarecord}{$canon}++;
# (closing braces for the if/if/while/for were on dropped lines 144-148)
147 close $fh or die "Could not close 'zcat ...': $!";
# --- phase 2: count metabase.log records per minute ---
149 my $found_records = keys %S;
150 if ( $found_records > 0 ) {
# Only scan the log window covered by the json deltas we actually saw.
151 my $mindate = minstr keys %S;
152 my $maxdate = maxstr keys %S;
153 my $mblog = "$ENV{HOME}/var/metabase-log/metabase.log";
# NOTE(review): 2-arg open of a fixed path; pristine file may differ.
154 open my $fh, $mblog or die "Could not open '$mblog': $!";
155 if ($Opt{seek}) {
# Jump deep into the log, then discard the (probably partial) line we
# landed in the middle of.
156 seek $fh, $Opt{seek}, 0;
157 <$fh>; # throw away this one
159 #[2012-10-07T06:45:15Z] [Chris Williams (BINGOS)] [pass] [KJELLM/Test-Spec-RMock-0.003.tar.gz] [i86pc-solaris-thread-multi-64int] [perl-v5.14.1] [8d4e0766-104a-11e2-bdcc-373e3b6b8117] [2012-10-07T06:45:15Z]
160 #[2012-10-04T07:14:06Z] [Chris Williams (BINGOS)] [pass] [TOBYINK/P5U-Command-Peek-0.001.tar.gz] [x86_64-linux-thread-multi-ld] [perl-v5.12.4] [15dfc45c-0df3-11e2-bdcc-373e3b6b8117] [2012-10-04T07:14:06Z]
# Dedupe on guid: the log can contain repeats.
161 my %mbseen;
162 while (<$fh>) {
163 next if /^\.\.\./; # we know that we missed something upstream
# Seven bracketed fields per line (see samples above); an eighth
# bracket (received date) is ignored.
164 my($date, $author, $state, $distro, $platform, $perl, $guid) =
165 /^\[([^\]]*)\]
166 \s+\[([^\]]*)\]
167 \s+\[([^\]]*)\]
168 \s+\[([^\]]*)\]
169 \s+\[([^\]]*)\]
170 \s+\[([^\]]*)\]
171 \s+\[([^\]]*)\]
172 /x or die "non-matching line: '$_'\n";
173 next if $mbseen{$guid}++;
# Normalize "2012-10-07T06:45:15Z" to the same YYYYMMDDHHMM minute
# format used by fulldate, so it can index %S directly.
174 $date =~ s/:\d\dZ$//; # cut seconds off
175 $date =~ s/[^0-9]//g; # remove [-T:]
176 next unless $date ge $mindate && $date le $maxdate;
177 $S{$date}{mblog}++;
178 if ( $Opt{collectguids} ) {
# Build the same 5-tuple join key as on the cpanstats side; the dist
# name/version come from parsing the tarball path, the perl version
# from stripping the "perl-v" prefix.
179 my $d = CPAN::DistnameInfo->new($distro);
180 my($shortperl) = $perl =~ /perl-v(.+)/;
181 my $canon = join "\t", $state, $platform, $d->dist, $d->version, $shortperl;
# Per canonical record, the set of metabase guids seen for it.
182 $S{$date}{mblogrecord}{$canon}{$guid}++;
# --- phase 3: report per-minute counts and (optionally) missing guids ---
185 my @fields = qw(mblog cpandbdelta);
186 my %SUM = map { ($_ => 0) } @fields;
187 my $j = 0;
# Minutes are all-digit strings of equal length, so numeric sort works.
188 for my $m (sort { $a <=> $b } keys %S) {
189 $S{$m}{mblog} ||= 0;
190 $S{$m}{cpandbdelta} ||= 0;
# Disabled debug branch left in place.
191 if (0) {
192 } else {
# Columns: minute, metabase count, cpanstats count, difference.
193 printf "%s %5d %5d %5d\n", $m, $S{$m}{mblog}, $S{$m}{cpandbdelta}, $S{$m}{mblog}-$S{$m}{cpandbdelta};
194 my $miss = $S{$m}{mblog}-$S{$m}{cpandbdelta};
195 my $i = 0;
196 if ( $Opt{collectguids} ) {
197 for my $k (keys %{$S{$m}{mblogrecord}}) {
198 # $k is a canonized record; mblog stands for metabase log
199 my $v = $S{$m}{mblogrecord}{$k};
200 # $v is a hashref, the keys are guids we might want to publish
201 my $cnt_mb = keys %$v;
202 my $cnt_ct = $S{$m}{cpandbdeltarecord}{$k} || 0;
# More guids on the metabase side than records on the cpanstats side:
# list every candidate guid (some are false positives, see POD).
203 if ( $cnt_mb > $cnt_ct ) {
204 for my $k2 (keys %$v) {
# $i counts within this minute, $j across the whole run.
205 printf " %d(%d): %s\t%s\n", ++$i, ++$j, $k2, $k;
211 for my $k (@fields) {
212 $SUM{$k} += $S{$m}{$k};
215 if (0) {
216 } else {
217 printf ".........SUM %5d %5d %5d\n", $SUM{mblog}, $SUM{cpandbdelta}, $SUM{mblog}-$SUM{cpandbdelta};
# No minute survived the --pickdate filter: explain what was scanned.
219 } else {
220 warn sprintf
222 "None of the %d records among the jsonfiles (\n%s) was matching pickdate '%s'",
223 $delta_records,
224 join("",map(" $_\n",@jsonfiles)),
225 $Opt{pickdate},
229 # Local Variables:
230 # mode: cperl
231 # cperl-indent-level: 4
232 # End: