#!/usr/bin/perl

# use 5.010;
use strict;
use warnings;

=head1 NAME

refill-cpanstatsdb.pl - fetch new cpantesters reports and fill them into the local cpanstats table

=head1 SYNOPSIS

  ~/src/installed-perls/v5.16.0/4e6d/bin/perl bin/refill-cpanstatsdb.pl
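
A run with explicit limits (the option values here are only
illustrative) might be started as:

  ~/src/installed-perls/v5.16.0/4e6d/bin/perl bin/refill-cpanstatsdb.pl \
      --maxtime=3600 --maxins=5000 --sleeptime=300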

=head1 OPTIONS

=over 8

=cut
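
# The option specs for Getopt::Long are extracted from the documentation
# below: the heredoc runs up to the literal "=back" line, and each spec
# written as B<--spec> in it is captured into @opt.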
my @opt = <<'=back' =~ /B<--(\S+)>/g;

=item B<--help|h!>

This help

=item B<--finishlimit=i>

A query that yields a result with fewer rows than this number is the
signal to refrain from further refill queries and finish this program.
Defaults to 0, which means other limits are needed to stop this
program.

Note: before we invented the sleep parameters, this was how we stopped
the program. Probably not needed anymore.

=item B<--maxins=i>

Maximum number of records to insert. No default, which means no limit.
If set to zero, we only test the surroundings and then exit.

=item B<--maxtime=i>

Maximum time in seconds this program should run. Defaults to 1770. If
set to zero, no limit.

=item B<--queryid=i>

Normally the database is asked for its max(id) and the first query
then starts one above that with an open end. If --queryid is
specified, then we query that and only that id and then finish the
program.

=item B<--sleeplimit=i>

A query that yields a result with fewer rows than this number is the
signal to sleep for $Opt{sleeptime} seconds before querying again.
Defaults to 500. Do not set it too low, or it will produce an annoying
number of logfiles.

=item B<--sleeptime=i>

How long to sleep when a query result falls below $Opt{sleeplimit}.
Defaults to 150 seconds.

=back

=head1 DESCRIPTION

Replacement for the job that downloaded the whole cpanstats.db and
gunzipped it.

Now we simply fetch the descriptions of the next 2500 reports over and
over again until the supply dries out. Thus we reach a new maximum id,
write all the stuff to the db and let the other jobs work from there.
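
In outline, each round of the QUERY loop below boils down to this
simplified sketch (ignoring logging, the empty-result case, and the
various limits):

  my $result  = $query->range("$nextid-");  # open-ended batch starting at $nextid
  my $thismax = max(keys %$result);         # highest report id in this batch
  # insert the ids that are new to Pg, append every record as JSON to a gzipped logfile
  $nextid = $thismax + 1;                   # the next round starts after the new maximum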

=head1 TODO

Remove unneeded data, maybe split them out.

=head1 SEE ALSO

refill-cpanstatsdb-minutes.pl

=cut

use FindBin;
use lib "$FindBin::Bin/../CPAN-Blame/lib";
use CPAN::Blame::Config::Cnntp;

use Dumpvalue;
use File::Basename ();
use File::Path ();
use File::Spec;
use File::Temp;
use Getopt::Long;
use Pod::Usage;
use Hash::Util qw(lock_keys);
use List::Util qw(min);
use lib "$FindBin::Bin/../CPAN-Blame/lib";
use IPC::ConcurrencyLimit;
use Redis;

our %Opt;
lock_keys %Opt, map { /([^=|!]+)/ } @opt;
GetOptions(\%Opt,
           @opt,
          ) or pod2usage(1);
if ($Opt{help}) {
    pod2usage(0);
}
$Opt{finishlimit} ||= 0;
$Opt{sleeplimit} ||= 500;
$Opt{sleeptime} ||= 150;
$Opt{maxtime} = 1770 unless defined $Opt{maxtime};

my($workdir);
BEGIN {
    $workdir = File::Spec->catdir
        ($CPAN::Blame::Config::Cnntp::Config->{solver_vardir},
         "workdir");
}
my($basename) = File::Basename::basename(__FILE__);
my $limit = IPC::ConcurrencyLimit->new
    (
     max_procs => 1,
     path => "$workdir/IPC-ConcurrencyLimit-$basename",
    );
my $limitid = $limit->get_lock;
if (not $limitid) {
    warn "Another process appears to be still running. Exiting.";
    exit(0);
}

use DBI;
use Time::HiRes qw(time);
use JSON::XS ();
use List::Util qw(max);
use CPAN::Testers::WWW::Reports::Query::Reports;

our $jsonxs = JSON::XS->new->indent(0);
our $redis = Redis->new(reconnect => 120, every => 1000);
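# $jsonxs writes one compact (indent(0)) JSON record per line into the
# gzipped logfile; $redis is used to remember every dist-version seen.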

my($pgdbh,$pgsth,$pgmaxid,$nextid);
$pgdbh = DBI->connect("dbi:Pg:dbname=analysis") or die "Could not connect to 'analysis': $DBI::err";
if ($Opt{queryid}) {
    $pgmaxid = 0;
    $nextid = $Opt{queryid};
} else {
    my $sql = "select max(id) from cpanstats";
    $pgsth = $pgdbh->prepare($sql);

    my $rv = eval { $pgsth->execute(); };
    unless ($rv) {
        my $err = $pgsth->errstr;
        die "Warning: error occurred while executing '$sql': $err";
    }

    my(@row) = $pgsth->fetchrow_array();
    $pgmaxid = $row[0];
    warn "INFO: In Pg found max id '$pgmaxid'";
    $nextid = $pgmaxid+1;
}

my $sql = "INSERT INTO cpanstats
 (id,guid,state,postdate,tester,dist,version,platform,perl,osname,osvers,fulldate,type) values
 (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
$pgsth = $pgdbh->prepare($sql);

my $query = CPAN::Testers::WWW::Reports::Query::Reports->new;
my($inscount) = 0;
my($pg_n,$pg_time) = (0,0);
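
# Main loop: fetch batches of reports starting at $nextid; when nothing new
# arrives, sleep and retry, and stop once one of the limits (--maxins,
# --maxtime, --finishlimit) is hit or the single --queryid query is done.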
QUERY: while () {
    my $range = $Opt{queryid} ? $nextid : "$nextid-";
    warn sprintf "%s: Next query range '%s'\n", scalar gmtime(), $range;
    my $result = $query->range($range);
    my $querycnt = keys %$result;
    my $thismax = $querycnt > 0 ? max(keys %$result) : undef;
    warn sprintf "%s: Got %d records from '%s' to '%s'\n", scalar gmtime(), $querycnt, $nextid, $thismax||"<UNDEF>";
    if (defined($Opt{maxins}) && $Opt{maxins} <= 0) {
        last QUERY;
    }
    unless ($thismax){
        if ($Opt{maxtime} && time+$Opt{sleeptime}-$^T >= $Opt{maxtime}) {
            last QUERY;
        } else {
            sleep $Opt{sleeptime};
            next QUERY;
        }
    }

    # so we have some work to do
    my @gmtime = gmtime;
    my $logfile = sprintf
        (
         "%s/var/refill-cpanstatsdb/%04d/%02d/%04d%02d%02dT%02d%02d-%d-MAX.json.gz",
         $ENV{HOME},
         1900+$gmtime[5],
         1+$gmtime[4],
         1900+$gmtime[5],
         1+$gmtime[4],
         @gmtime[3,2,1],
         $nextid,
        );
    File::Path::mkpath File::Basename::dirname $logfile;
    if (-e $logfile) {
        die "ALERT: found '$logfile', will not overwrite it";
    }
    open my $fh, "|-", "gzip -9c > $logfile" or die "Could not open gzip to '$logfile': $!";
    my $next_log = time + 60;
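    # a single record as returned by the range query looks like this: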
    # dist => "Attribute-Overload",
    # fulldate => 201205262229,
    # guid => "4454e538-a782-11e1-802a-3db30df65b4f",
    # id => 22285792,
    # osname => "linux",
    # osvers => "2.6.18-1.2798.fc6",
    # perl => "5.16.0 RC0",
    # platform => "i686-linux-thread-multi-64int-ld",
    # postdate => 201205,
    # state => "fail",
    # tester => "Khen1950fx\@aol.com",
    # type => 2,
    # version => "1.100710",
    my $i = 0;
    my $max_seen;
    REC: for my $id (sort {$a <=> $b} keys %$result) {
        if (defined($Opt{maxins}) && $inscount >= $Opt{maxins}) {
            last REC;
        }
        if ($Opt{maxtime} && time-$^T >= $Opt{maxtime}) {
            last REC;
        }
        $max_seen = $id;
        my $record = $result->{$id};
        if ($id > $pgmaxid) {
            my $start = time;
            $pgsth->execute($id,@{$record}{qw(guid state postdate tester dist version platform perl osname osvers fulldate type)});
            $pg_n++;
            $pg_time += time - $start;
        }
        my $distv = "$record->{dist}-$record->{version}";
        $redis->sadd("analysis:distv:legalset",$distv);
        #### hincrby not supported by our ubuntu redis
        #### if ($record->{state} eq "pass") {
        ####     $redis->hincrby("analysis:distv:pass",$distv,1);
        #### } elsif ($record->{state} eq "fail") {
        ####     $redis->hincrby("analysis:distv:fail",$distv,1);
        #### }
        # ddx $record; # see also Data::Dump line
        print $fh $jsonxs->encode($record), "\n";
        $i++;
        if (time >= $next_log) {
            warn sprintf "%s: %d records inserted\n", scalar gmtime(), $i;
            $next_log += 60;
        }
        $inscount++;
    }
    close $fh or die "Could not close gzip to '$logfile': $!";
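    # swap the MAX placeholder in the logfile name for the highest report id
    # actually seen ($nextid-1 if no record was processed this round)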
    my $finallogfile = $logfile;
    unless ($max_seen) {
        $max_seen = $nextid - 1;
    }
    $finallogfile =~ s/MAX/$max_seen/;
    rename $logfile, $finallogfile or die "Could not rename $logfile, $finallogfile: $!";
    if ($Opt{queryid}) {
        last QUERY;
    }
    if ( $Opt{finishlimit} && $querycnt < $Opt{finishlimit}) {
        last QUERY;
    }
    if (defined($Opt{maxins}) && $inscount >= $Opt{maxins}) {
        last QUERY;
    }
    my $sleeptime = 0;
    if ( $Opt{sleeplimit} && $querycnt < $Opt{sleeplimit} ) {
        $sleeptime = $Opt{sleeptime};
    }
    if ($Opt{maxtime} && time+$sleeptime-$^T >= $Opt{maxtime}) {
        last QUERY;
    }
    if ($sleeptime) {
        sleep $sleeptime;
    }
    $nextid = $thismax+1;
}
if ($pg_n) {
    warn sprintf "STATS: pg avg ins time per rec %.5f\n", $pg_time/$pg_n;
}

# for the record: today I added these two:
# CREATE INDEX ixdist ON cpanstats (dist); # took ca 30 minutes
# CREATE INDEX ixtypestate ON cpanstats (type, state);
# DROP INDEX ixvers;

# Local Variables:
# mode: cperl
# cperl-indent-level: 4
# End: