13 ~/src/installed-perls/v5.34.0/c310/bin/perl bin/refill-cpanstatsdb.pl
21 my @opt = <<'=back' =~ /B<--(\S+)>/g;
27 =item B<--finishlimit=i>
29 A query that yields a result with less rows than this number is the
30 signal to refrain from further refill queries and finish this program.
31 Defaults to 0 which means other limits are needed to stop this
34 Note: before we invented the --sleeplimit and --sleeptime parameters,
35 this was the way how we stopped the program. Probably not needed
38 =item B<--generate-redis-legalset!>
40 Boolean. Defaults to false. If true, writes something into Redis (RTFS).
44 No default, which means no limit. Maximum number of records to inject.
45 If set to zero, we test the surroundings, then exit.
49 Maximum time in seconds this program should run. Defaults to 1770. If
50 set to zero, no limit.
54 Normally the database is asked for its max(id) and then the first
55 query is about one more than that with an open end. If --queryid is
56 specified, then we query that and only that and then finish the
59 =item B<--sleeplimit=i>
61 A query that yields a result with less rows than this number is the
62 signal to sleep for $Opt{sleeptime} seconds before querying again.
63 Defaults to 500. Do not set it too low, it would produce an annoying
66 =item B<--sleeptime=i>
68 For how long to sleep in the case of $Opt{sleeplimit} undercut.
69 Defaults to 150 seconds.
71 =item B<--throttletime=i>
73 Defaults to 30. If > 0, we wait at the end of each batch if we were
74 faster than this parameter (in secs).
80 Replacement for the job that downloaded the whole cpanstats.db and
83 Now we simply repeatedly fetch the descriptions for the next 2500
84 reports until the supply dries out. Thus we reach a new max, write all
85 the stuff to the db and let the other jobs work from there.
89 remove unneeded data, maybe split them out.
93 refill-cpanstatsdb-minutes.pl
95 CPAN::Testers::WWW::Reports::Query::Reports
101 use lib
"$FindBin::Bin/../CPAN-Blame/lib";
102 use CPAN
::Blame
::Config
::Cnntp
;
105 use File
::Basename
();
111 use Hash
::Util
qw(lock_keys);
112 use List
::Util
qw(min);
113 use lib
"$FindBin::Bin/../CPAN-Blame/lib";
114 use IPC
::ConcurrencyLimit
;
118 lock_keys
%Opt, map { /([^=|!]+)/ } @opt;
125 $Opt{finishlimit
} ||= 0;
126 $Opt{sleeplimit
} ||= 500;
127 $Opt{sleeptime
} ||= 150;
128 $Opt{maxtime
} = 1770 unless defined $Opt{maxtime
};
129 $Opt{throttletime
} //= 30;
131 if ($Opt{"generate-redis-legalset"}) {
132 $HAVE_REDIS = eval { require Redis
; 1 };
136 die "you do not have Redis installed, but we need it to support generate-redis-legalset";
142 $workdir = File
::Spec
->catdir
143 ($CPAN::Blame
::Config
::Cnntp
::Config
->{solver_vardir
},
147 my($basename) = File
::Basename
::basename
(__FILE__
);
148 my $limit = IPC
::ConcurrencyLimit
->new
151 path
=> "$workdir/IPC-ConcurrencyLimit-$basename",
153 my $limitid = $limit->get_lock;
155 warn "Another process appears to be still running. Exiting.";
160 use Time
::HiRes
qw(time);
162 use List
::Util
qw(max);
163 use CPAN
::Testers
::WWW
::Reports
::Query
::Reports
;
165 our $jsonxs = JSON
::XS
->new->indent(0);
168 $redis = Redis
->new(reconnect
=> 120, every
=> 1000);
171 my($pgdbh,$pgsth,$pgmaxid,$nextid);
173 $pgdbh = DBI
->connect("dbi:Pg:dbname=analysis") or die "Could not connect to 'analysis': $DBI::err";
176 $nextid = $Opt{queryid
};
178 my @tables = map { /^public\.cpanstats_(\d{6})$/ ?
$1 : () } $pgdbh->tables("","public");
179 %Partitions = map { $_ => 1 } @tables;
181 for my $pd (sort @tables){
182 my $sth = $pgdbh->prepare("select max(id) from cpanstats_$pd where postdate=?");
184 my $max = $sth->fetchrow_array;
185 $Max = $max if $max > $Max;
187 warn "INFO: In Pg found max id '$Max'";
192 my $sql = "INSERT INTO cpanstats
193 (id,guid,state,postdate,tester,dist,version,platform,perl,osname,osvers,fulldate,type) values
194 (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
195 $pgsth = $pgdbh->prepare($sql);
197 # oldestpostdate, copied from genoldestpostdate.pl:
198 my($sth_ins,$sth_exists);
200 my $sql = "INSERT INTO oldestpostdate
201 (dist,version,postdate) VALUES
203 $sth_ins = $pgdbh->prepare($sql);
206 my $sql = "SELECT COUNT(*) FROM oldestpostdate WHERE dist=? AND version=?";
207 $sth_exists = $pgdbh->prepare($sql);
209 my $query = CPAN
::Testers
::WWW
::Reports
::Query
::Reports
->new;
211 my($pg_n,$pg_time) = (0,0);
213 my $range = $Opt{queryid
} ?
$nextid : "$nextid-";
214 my $batchstart = time;
215 warn sprintf "%s: Next query range '%s'\n", scalar gmtime(), $range;
216 my $result = $query->range($range);
217 my $RRL = ref $result->{list
}; # Ref of ResultList
218 my $querycnt = $RRL eq "HASH" ?
keys %{$result->{list
}} : @
{$result->{list
}};
219 my $thismax = $querycnt > 0 ?
($RRL eq "HASH" ?
(max
(keys %{$result->{list
}})) : $result->{list
}[-1]{id
}) : undef;
220 warn sprintf "%s: Got %d records from '%s' to '%s'\n", scalar gmtime(), $querycnt, $nextid, $thismax||"<UNDEF>";
221 if (defined($Opt{maxins
}) && $Opt{maxins
} <= 0) {
225 if ($Opt{maxtime
} && time+$Opt{sleeptime
}-$^T
>= $Opt{maxtime
}) {
228 sleep $Opt{sleeptime
};
233 # so we have some work to do
235 my $logfile = sprintf
237 "%s/var/refill-cpanstatsdb/%04d/%02d/%04d%02d%02dT%02d%02d-%d-MAX.json.gz",
246 File
::Path
::mkpath File
::Basename
::dirname
$logfile;
248 die "ALERT: found '$logfile', will not overwrite it";
250 open my $fh, "|-", "gzip -9c > $logfile" or die "Could not open gzip to '$logfile': $!";
251 binmode($fh, ":utf8");
252 my $next_log = time + 60;
253 # dist => "Attribute-Overload",
254 # fulldate => 201205262229,
255 # guid => "4454e538-a782-11e1-802a-3db30df65b4f",
258 # osvers => "2.6.18-1.2798.fc6",
259 # perl => "5.16.0 RC0",
260 # platform => "i686-linux-thread-multi-64int-ld",
261 # postdate => 201205,
263 # tester => "Khen1950fx\@aol.com",
265 # version => "1.100710",
268 REC
: for my $rec ( $RRL eq "HASH" ?
(sort {$a->{id
} <=> $b->{id
}} values %{$result->{list
}}) : @
{$result->{list
}}) {
269 if (defined($Opt{maxins
}) && $inscount >= $Opt{maxins
}) {
272 if ($Opt{maxtime
} && time-$^T
>= $Opt{maxtime
}) {
275 $max_seen = $rec->{id
} if $rec->{id
} > $max_seen;
277 my $id = $record->{id
};
278 if ($id > $pgmaxid) {
280 unless ($Partitions{$record->{postdate
}}){ # partition anlegen
281 $pgdbh->do("CREATE TABLE cpanstats_$record->{postdate} PARTITION OF cpanstats FOR VALUES IN (?)",undef,$record->{postdate
})
282 or die sprintf "Error while trying create cpan_%s: %s", $record->{postdate
}, $pgdbh->errstr;
283 warn "INFO: partition cpanstats_$record->{postdate} created\n";
284 $Partitions{$record->{postdate
}}++;
286 $record->{fulldate
} = sprintf "%04d-%02d-%02dT%02d:%02dz", unpack("a4a2a2a2a2", $record->{fulldate
});
287 $pgsth->execute($id,@
{$record}{qw(guid state postdate tester dist version platform perl osname osvers fulldate type)});
289 $pg_time += time - $start;
290 # oldestpostdate, copied from genoldestpostdate.pl:
291 my($dist,$version,$postdate) = @
{$record}{qw(dist version postdate)};
292 $sth_exists->execute($dist,"");
293 my($cnt0) = $sth_exists->fetchrow_array;
295 $sth_ins->execute($dist,"",$postdate);
297 $sth_exists->execute($dist,$version);
298 my($cnt1) = $sth_exists->fetchrow_array;
300 $sth_ins->execute($dist,$version,$postdate);
303 if ($Opt{"generate-redis-legalset"}) {
304 my $distv = "$record->{dist}-$record->{version}";
305 $redis->sadd("analysis:distv:legalset",$distv);
306 #### hincrby not supported by our ubuntu redis
307 #### if ($record->{state} eq "pass") {
308 #### $redis->hincrby("analysis:distv:pass",$distv,1);
309 #### } elsif ($record->{state} eq "fail") {
310 #### $redis->hincrby("analysis:distv:fail",$distv,1);
313 # ddx $record; # see also Data::Dump line
314 print $fh $jsonxs->encode($record), "\n";
316 if (time >= $next_log) {
317 warn sprintf "%s: %d records inserted\n", scalar gmtime(), $i;
322 close $fh or die "Could not close gzip to '$logfile': $!";
323 my $finallogfile = $logfile;
325 $max_seen = $nextid - 1;
327 $finallogfile =~ s/MAX/$max_seen/;
328 rename $logfile, $finallogfile or die "Could not rename $logfile, $finallogfile: $!";
332 if ( $Opt{finishlimit
} && $querycnt < $Opt{finishlimit
}) {
335 if (defined($Opt{maxins
}) && $inscount >= $Opt{maxins
}) {
339 if ( $Opt{sleeplimit
} && $querycnt < $Opt{sleeplimit
} ) {
340 $sleeptime = $Opt{sleeptime
};
342 if ($Opt{maxtime
} && time+$sleeptime-$^T
>= $Opt{maxtime
}) {
348 my $batchtook = time - $batchstart;
349 if ($Opt{throttletime
} and $batchtook < $Opt{throttletime
}) {
350 my $throttle_sleep = $Opt{throttletime
} - $batchtook;
351 sleep $throttle_sleep;
353 $nextid = $thismax+1;
356 warn sprintf "STATS: pg_time %.3f, recs written %d, pg avg ins time per rec %.5f\n", $pg_time, $pg_n, $pg_time/$pg_n;
359 # for the record: today I added the two:
360 # CREATE INDEX ixdist ON cpanstats (dist); # took ca 30 minutes
361 # CREATE INDEX ixtypestate ON cpanstats (type, state);
366 # cperl-indent-level: 4