new perls v5.39.10
[andk-cpan-tools.git] / bin / refill-cpanstatsdb-minutes.pl
blob8e87a3531120dad54a747e4538b3a18bfb0c0dd3
1 #!/usr/bin/perl
3 # use 5.010;
4 use strict;
5 use warnings;
7 =encoding utf8
9 =head1 NAME
13 =head1 SYNOPSIS
15 refill-cpanstatsdb-minutes.pl directory
17 =head1 OPTIONS
19 =over 8
21 =cut
23 my @opt = <<'=back' =~ /B<--(\S+)>/g;
25 =item B<--help|h!>
27 This help
29 =item B<--collectguids!>
31 Try to identify the guids on amazon side that are missing on
32 cpantesters side. Report them together with the record in
33 tab-separated format. (Misnomer: originally we reported only the guid
34 but it turned out the guid was not usable)
36 =item B<--pickdate=s>
38 Limits the investigation to a certain date. Format must be
39 left-aligned, e.g. C<20121008>. Note that results are unreliable for
40 dates lower than 20121008 and that at least 15 hours should have
41 passed since the end of the picked day because analysis fetches fresh
42 data from upstream only every 15 hours and then needs about 6 hours
43 for processing. So better is 21 hours, and give room for unanticipated
44 stuff.
46 =item B<--seek=i>
48 Defaults to 2_800_000_000. The logfile is huge and skipping 2.8 G
49 helps run times enormously. On the relation to dates:
51 2700000000 2013-12-16T21:46:30Z
52 2800000000 2014-01-02T06:34:34Z
53 2900000000 2014-01-17T11:43:43Z
54 3000000000 2014-01-30T15:29:56Z
55 3100000000 2014-02-14T16:14:52Z
56 11099702509 2017-04-17T11:26:33Z
58 IOW: 100_000_000 ≙ (U+2259 ≙ estimates) 2 weeks unchanged over the
59 years.
61 =back
63 =head1 DESCRIPTION
65 Opens directory, looks into all json files that describe recent
66 cpanstats records, collects all minutes and counts number of reports
67 in amazon and in cpanstatsdb. With the --collectguids argument it also
68 lists guids on the amazon side that are either definitively missing or
69 are victims of some confusion. Confusions can arise because the source
70 of the amazon data comes from log.txt which does not provide
71 osversion. Or because testers send duplicates. Or because somebody
72 calculates the time wrongly. We're not sure. Some of the GUIDs we
73 report are most probably not missing. On initial inspection this
74 seems to be in the 1 to 15 percent range.
76 =head1 HISTORICAL EVIDENCE
78 2013-04-29 21:25 I called this program with
80 andreas@ds8143:~/src/andk-cpan-tools% /home/andreas/src/installed-perls/v5.16.0/4e6d/bin/perl bin/refill-cpanstatsdb-minutes.pl --pickdate=20130422 ~/var/refill-cpanstatsdb/2013/04 | perl -nale 'printf "%d %d\n", $F[2], $S+=$F[2] if $F[0] ge "201304220623" and $F[0] le "201304221306"'
82 to count the number of reports in a specific range that I had spotted
83 on a first run piping to "less".
85 Revisiting my history I find
87 2012-10-17 07:17 /home/andreas/src/installed-perls/v5.16.0/4e6d/bin/perl -d /tmp/refill-cpanstatsdb-minutes.pl --pickdate=201210082202 --collectguids ~/var/refill-cpanstatsdb/2012/10
89 Interesting to see a longer argument to pickdate.
91 =cut
# ------------------------------------------------------------------
# NOTE(review): this text appears to be a gitweb scrape, not the
# pristine file. The embedded original line numbers skip (98-99, 114,
# 117, 143-146, 148, 158, 183-184, 206-210, 213-214, 218, 221,
# 226-228), i.e. blank lines and several closing braces/semicolons
# were lost in extraction. The code below is NOT runnable as shown;
# recover the original file before making code changes. Comments
# describe the apparent intended structure only.
# ------------------------------------------------------------------
# --- module loading ---
94 use FindBin;
95 use lib "$FindBin::Bin/../lib";
96 BEGIN {
# NOTE(review): the BEGIN block's closing brace sat on a dropped line
# (original 98/99) -- confirm against the pristine source.
97 push @INC, qw( );
100 use Dumpvalue;
101 use File::Basename qw(dirname);
102 use File::Path qw(mkpath);
103 use File::Spec;
104 use File::Temp;
105 use Getopt::Long;
106 use Hash::Util qw(lock_keys);
107 use CPAN::DistnameInfo;
# --- option parsing ---
# %Opt receives the parsed command line. lock_keys restricts the hash
# to exactly the option names harvested from the POD into @opt (the
# map strips the Getopt::Long type suffixes "=s", "!", "=i"), so a
# typo'd key access anywhere below becomes a hard error.
109 our %Opt;
110 lock_keys %Opt, map { /([^=!]+)/ } @opt;
111 GetOptions(\%Opt,
112 @opt,
113 ) or pod2usage(1);
# NOTE(review): pod2usage() is called here and below, but no
# `use Pod::Usage;` is visible -- presumably it was on a dropped
# line; verify, otherwise this dies with "Undefined subroutine".
# --pickdate must be all digits with no leading zero (a left-aligned
# YYYYMMDD... prefix, per the POD).
115 if ( $Opt{pickdate} ) {
116 die "Invalid pickdate[$Opt{pickdate}]" unless $Opt{pickdate} =~ /^[1-9][0-9]+$/
# Default byte offset into the huge metabase.log (see POD table
# relating offsets to dates).
118 $Opt{seek} //= 2_800_000_000;
120 use JSON::XS ();
121 use List::Util qw(minstr maxstr);
122 use DBD::SQLite;
# Shared decoder; indent(0) only matters for encoding, decode is used below.
123 our $jsonxs = JSON::XS->new->indent(0);
# --- phase 1: count cpanstatsdb delta records per minute ---
# The single positional argument is a directory of *.json.gz files,
# one JSON report per line (sample record at original line 134).
125 my $dir = shift or pod2usage(1);
126 opendir my $dh, $dir or die "Could not opendir '$dir': $!";
127 my @jsonfiles = sort grep /\.json\.gz$/, readdir $dh;
# %S is keyed by minute (fulldate, YYYYMMDDHHMM); per minute we track
# cpandbdelta / mblog counters and, with --collectguids, per-record
# breakdowns keyed by a tab-joined canonical tuple.
128 my %S;
129 my $delta_records = 0;
130 for my $dirent (@jsonfiles) {
131 my $abs = "$dir/$dirent";
# -M is in days; 1/12 day = two hours.
132 next unless -M $abs > 1/12; # must be older than two hours to prevent reading unfinished files
# NOTE(review): "zcat" => $abs relies on list-form open; fine, but the
# die message says "fork" for any open failure.
133 open my $fh, "-|", "zcat" => $abs or die "Could not fork: $!";
134 # {"version":"0.003","dist":"Test-Spec-RMock","osvers":"2.11","state":"pass","perl":"5.14.1","fulldate":"201210070645","osname":"solaris","postdate":"201210","platform":"i86pc-solaris-thread-multi-64int","guid":"8d4da104-104a-11e2-bdcc-373e3b6b8117","id":"23770157","type":"2","tester":"root@klanker.bingosnet.co.uk ((Charlie &))"}
135 while (<$fh>) {
136 my $report = $jsonxs->decode($_);
137 my $minute = $report->{fulldate};
138 $delta_records++;
# Keep the record when no --pickdate given, or when the minute starts
# with the picked date prefix.
139 if (!$Opt{pickdate} || $Opt{pickdate} && $minute =~ /^$Opt{pickdate}/) {
140 $S{$minute}{cpandbdelta}++;
141 if ( $Opt{collectguids} ) {
# Canonical identity of a report *without* guid/osversion: the metabase
# log lacks osversion, so this 5-tuple is the join key between sides.
142 my($canon) = join "\t", @{$report}{qw(state platform dist version perl)};
143 $S{$minute}{cpandbdeltarecord}{$canon}++;
# (closing braces for the if/if/while/for were on dropped lines 144-148)
147 close $fh or die "Could not close 'zcat ...': $!";
# --- phase 2: count metabase.log records per minute ---
149 my $found_records = keys %S;
150 if ( $found_records > 0 ) {
# Only scan the log window covered by the json deltas we actually saw.
151 my $mindate = minstr keys %S;
152 my $maxdate = maxstr keys %S;
153 my $mblog = "$ENV{HOME}/var/metabase-log/metabase.log";
# NOTE(review): 2-arg open of a fixed path; pristine file may differ.
154 open my $fh, $mblog or die "Could not open '$mblog': $!";
155 if ($Opt{seek}) {
# Jump deep into the log, then discard the (probably partial) line we
# landed in the middle of.
156 seek $fh, $Opt{seek}, 0;
157 <$fh>; # throw away this one
159 #[2012-10-07T06:45:15Z] [Chris Williams (BINGOS)] [pass] [KJELLM/Test-Spec-RMock-0.003.tar.gz] [i86pc-solaris-thread-multi-64int] [perl-v5.14.1] [8d4e0766-104a-11e2-bdcc-373e3b6b8117] [2012-10-07T06:45:15Z]
160 #[2012-10-04T07:14:06Z] [Chris Williams (BINGOS)] [pass] [TOBYINK/P5U-Command-Peek-0.001.tar.gz] [x86_64-linux-thread-multi-ld] [perl-v5.12.4] [15dfc45c-0df3-11e2-bdcc-373e3b6b8117] [2012-10-04T07:14:06Z]
# Dedupe on guid: the log can contain repeats.
161 my %mbseen;
162 while (<$fh>) {
163 next if /^\.\.\./; # we know that we missed something upstream
# Seven bracketed fields per line (see samples above); an eighth
# bracket (received date) is ignored.
164 my($date, $author, $state, $distro, $platform, $perl, $guid) =
165 /^\[([^\]]*)\]
166 \s+\[([^\]]*)\]
167 \s+\[([^\]]*)\]
168 \s+\[([^\]]*)\]
169 \s+\[([^\]]*)\]
170 \s+\[([^\]]*)\]
171 \s+\[([^\]]*)\]
172 /x or die "non-matching line: '$_'\n";
173 next if $mbseen{$guid}++;
# Normalize "2012-10-07T06:45:15Z" to the same YYYYMMDDHHMM minute
# format used by fulldate, so it can index %S directly.
174 $date =~ s/:\d\dZ$//; # cut seconds off
175 $date =~ s/[^0-9]//g; # remove [-T:]
176 next unless $date ge $mindate && $date le $maxdate;
177 $S{$date}{mblog}++;
178 if ( $Opt{collectguids} ) {
# Build the same 5-tuple join key as on the cpanstats side; the dist
# name/version come from parsing the tarball path, the perl version
# from stripping the "perl-v" prefix.
179 my $d = CPAN::DistnameInfo->new($distro);
180 my($shortperl) = $perl =~ /perl-v(.+)/;
181 my $canon = join "\t", $state, $platform, $d->dist, $d->version, $shortperl;
# Per canonical record, the set of metabase guids seen for it.
182 $S{$date}{mblogrecord}{$canon}{$guid}++;
# --- phase 3: report per-minute counts and (optionally) missing guids ---
185 my @fields = qw(mblog cpandbdelta);
186 my %SUM = map { ($_ => 0) } @fields;
187 my $j = 0;
# Minutes are all-digit strings of equal length, so numeric sort works.
188 for my $m (sort { $a <=> $b } keys %S) {
189 $S{$m}{mblog} ||= 0;
190 $S{$m}{cpandbdelta} ||= 0;
# Disabled debug branch left in place.
191 if (0) {
192 } else {
# Columns: minute, metabase count, cpanstats count, difference.
193 printf "%s %5d %5d %5d\n", $m, $S{$m}{mblog}, $S{$m}{cpandbdelta}, $S{$m}{mblog}-$S{$m}{cpandbdelta};
194 my $miss = $S{$m}{mblog}-$S{$m}{cpandbdelta};
195 my $i = 0;
196 if ( $Opt{collectguids} ) {
197 for my $k (keys %{$S{$m}{mblogrecord}}) {
198 # $k is a canonized record; mblog stands for metabase log
199 my $v = $S{$m}{mblogrecord}{$k};
200 # $v is a hashref, the keys are guids we might want to publish
201 my $cnt_mb = keys %$v;
202 my $cnt_ct = $S{$m}{cpandbdeltarecord}{$k} || 0;
# More guids on the metabase side than records on the cpanstats side:
# list every candidate guid (some are false positives, see POD).
203 if ( $cnt_mb > $cnt_ct ) {
204 for my $k2 (keys %$v) {
# $i counts within this minute, $j across the whole run.
205 printf " %d(%d): %s\t%s\n", ++$i, ++$j, $k2, $k;
211 for my $k (@fields) {
212 $SUM{$k} += $S{$m}{$k};
215 if (0) {
216 } else {
217 printf ".........SUM %5d %5d %5d\n", $SUM{mblog}, $SUM{cpandbdelta}, $SUM{mblog}-$SUM{cpandbdelta};
# No minute survived the --pickdate filter: explain what was scanned.
219 } else {
220 warn sprintf
222 "None of the %d records among the jsonfiles (\n%s) was matching pickdate '%s'",
223 $delta_records,
224 join("",map(" $_\n",@jsonfiles)),
225 $Opt{pickdate},
229 # Local Variables:
230 # mode: cperl
231 # cperl-indent-level: 4
232 # End: