new perl 5.41.4
[andk-cpan-tools.git] / bin / refill-cpanstatsdb.pl
blobe882ed6f7f045b144a61d3ec4e2c2be4675b337e
1 #!/usr/bin/perl
3 # use 5.010;
4 use strict;
5 use warnings;
7 =head1 NAME
11 =head1 SYNOPSIS
13 ~/src/installed-perls/v5.34.0/c310/bin/perl bin/refill-cpanstatsdb.pl
15 =head1 OPTIONS
17 =over 8
19 =cut
# Harvest Getopt::Long option specs by matching every B<--...> POD tag in
# the heredoc that follows; the heredoc terminator is the POD '=back'
# line, so the OPTIONS POD below is the single source of truth for @opt.
# (No comments may follow inside the heredoc -- they would become data.)
21 my @opt = <<'=back' =~ /B<--(\S+)>/g;
23 =item B<--help|h!>
25 This help
27 =item B<--finishlimit=i>
29 A query that yields a result with less rows than this number is the
30 signal to refrain from further refill queries and finish this program.
31 Defaults to 0 which means other limits are needed to stop this
32 program.
34 Note: before we invented the --sleeplimit and --sleeptime parameters,
35 this was how we stopped the program. Probably not needed
36 anymore.
38 =item B<--generate-redis-legalset!>
40 Boolean. Defaults to false. If true, writes something into Redis (RTFS).
42 =item B<--maxins=i>
44 No default, which means no limit. Maximum number of records to inject.
45 If set to zero, we test the surroundings, then exit.
47 =item B<--maxtime=i>
49 Maximum time in seconds this program should run. Defaults to 1770. If
50 set to zero, no limit.
52 =item B<--queryid=i>
54 Normally the database is asked for its max(id) and then the first
55 query is about one more than that with an open end. If --queryid is
56 specified, then we query that and only that and then finish the
57 program.
59 =item B<--sleeplimit=i>
61 A query that yields a result with less rows than this number is the
62 signal to sleep for $Opt{sleeptime} seconds before querying again.
63 Defaults to 500. Do not set it too low; it would produce an annoying
64 number of logfiles.
66 =item B<--sleeptime=i>
68 For how long to sleep in the case of $Opt{sleeplimit} undercut.
69 Defaults to 150 seconds.
71 =item B<--throttletime=i>
73 Defaults to 30. If > 0, we wait at the end of each batch if we were
74 faster than this parameter (in secs).
76 =back
78 =head1 DESCRIPTION
80 Replacement for the job that downloaded the whole cpanstats.db and
81 gunzipped it.
83 Now we simply repeatedly fetch the descriptions for the next 2500
84 reports until the supply dries out. Thus we reach a new max, write all
85 the stuff to the db and let the other jobs work from there.
87 =head1 TODO
89 remove unneeded data, maybe split them out.
91 =head1 SEE ALSO
93 refill-cpanstatsdb-minutes.pl
95 CPAN::Testers::WWW::Reports::Query::Reports
97 =cut
# --- Module loading, option post-processing, single-instance lock ------------
# NOTE(review): this is a gitweb-rendered excerpt; the original file's line
# numbers are fused into each line and several structure-only lines
# (closing braces/parens) were dropped by the extraction.  Code is left
# byte-for-byte untouched; only review comments are added.
100 use FindBin;
101 use lib "$FindBin::Bin/../CPAN-Blame/lib";
102 use CPAN::Blame::Config::Cnntp;
104 use Dumpvalue;
105 use File::Basename ();
106 use File::Path ();
107 use File::Spec;
108 use File::Temp;
109 use Getopt::Long;
110 use Pod::Usage;
111 use Hash::Util qw(lock_keys);
112 use List::Util qw(min);
# NOTE(review): duplicate of the 'use lib' a few lines up; harmless
# (use lib just prepends again) but could be dropped.
113 use lib "$FindBin::Bin/../CPAN-Blame/lib";
114 use IPC::ConcurrencyLimit;
115 our $HAVE_REDIS;
# %Opt holds the parsed command-line options.  lock_keys restricts the
# hash to the option names harvested from the POD (the text before any
# '=', '|' or '!' in each spec), so a typo'd key dies instead of
# silently autovivifying.
117 our %Opt;
118 lock_keys %Opt, map { /([^=|!]+)/ } @opt;
119 GetOptions(\%Opt,
120 @opt,
121 ) or pod2usage(1);
122 if ($Opt{help}) {
123 pod2usage(0);
# NOTE(review): the closing '}' of the help branch is not visible here --
# apparently dropped by the extraction.
# Defaults, documented in the OPTIONS POD above: finishlimit 0,
# sleeplimit 500, sleeptime 150s, maxtime 1770s, throttletime 30s.
125 $Opt{finishlimit} ||= 0;
126 $Opt{sleeplimit} ||= 500;
127 $Opt{sleeptime} ||= 150;
128 $Opt{maxtime} = 1770 unless defined $Opt{maxtime};
129 $Opt{throttletime} //= 30;
# Redis is only required when --generate-redis-legalset was requested;
# fail early if the module is missing in that case.
131 if ($Opt{"generate-redis-legalset"}) {
132 $HAVE_REDIS = eval { require Redis; 1 };
133 if ($HAVE_REDIS){
134 Redis->import();
135 } else {
136 die "you do not have Redis installed, but we need it to support generate-redis-legalset";
# Work directory under the configured solver_vardir; computed in a BEGIN
# block and reused below for the concurrency-limit lock path.
140 my($workdir);
141 BEGIN {
142 $workdir = File::Spec->catdir
143 ($CPAN::Blame::Config::Cnntp::Config->{solver_vardir},
144 "workdir");
# Single-instance guard: at most one copy of this script (keyed by its
# basename) may hold the lock at a time.
147 my($basename) = File::Basename::basename(__FILE__);
148 my $limit = IPC::ConcurrencyLimit->new
150 max_procs => 1,
151 path => "$workdir/IPC-ConcurrencyLimit-$basename",
153 my $limitid = $limit->get_lock;
154 if (not $limitid) {
155 warn "Another process appears to be still running. Exiting.";
156 exit(0);
# Runtime-only dependencies loaded after the lock is held.
159 use DBI;
160 use Time::HiRes qw(time);
161 use JSON::XS ();
162 use List::Util qw(max);
163 use CPAN::Testers::WWW::Reports::Query::Reports;
# One compact JSON encoder (indent(0) => one record per line) for the
# per-batch gzip logfile.
165 our $jsonxs = JSON::XS->new->indent(0);
166 our $redis;
167 if ( $HAVE_REDIS ) {
168 $redis = Redis->new(reconnect => 120, every => 1000);
# --- Postgres setup: discover current max id, prepare statements -------------
171 my($pgdbh,$pgsth,$pgmaxid,$nextid);
172 my %Partitions;
173 $pgdbh = DBI->connect("dbi:Pg:dbname=analysis") or die "Could not connect to 'analysis': $DBI::err";
# With --queryid we fetch exactly that one id; otherwise discover the
# highest report id already stored by scanning all monthly partitions
# (cpanstats_YYYYMM) and start one past it.
174 if ($Opt{queryid}) {
175 $pgmaxid = 0;
176 $nextid = $Opt{queryid};
177 } else {
178 my @tables = map { /^public\.cpanstats_(\d{6})$/ ? $1 : () } $pgdbh->tables("","public");
179 %Partitions = map { $_ => 1 } @tables;
180 my $Max = 0;
181 for my $pd (sort @tables){
182 my $sth = $pgdbh->prepare("select max(id) from cpanstats_$pd where postdate=?");
183 $sth->execute($pd);
184 my $max = $sth->fetchrow_array;
185 $Max = $max if $max > $Max;
187 warn "INFO: In Pg found max id '$Max'";
188 $pgmaxid = $Max;
189 $nextid = $Max+1;
# Prepared insert for the main (partitioned-by-postdate) cpanstats table.
192 my $sql = "INSERT INTO cpanstats
193 (id,guid,state,postdate,tester,dist,version,platform,perl,osname,osvers,fulldate,type) values
194 (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
195 $pgsth = $pgdbh->prepare($sql);
197 # oldestpostdate, copied from genoldestpostdate.pl:
198 my($sth_ins,$sth_exists);
# NOTE(review): this second 'my $sql' would mask the one above if they
# shared a scope; the enclosing bare-block braces were presumably dropped
# by the extraction.
200 my $sql = "INSERT INTO oldestpostdate
201 (dist,version,postdate) VALUES
202 (?, ?, ?)";
203 $sth_ins = $pgdbh->prepare($sql);
206 my $sql = "SELECT COUNT(*) FROM oldestpostdate WHERE dist=? AND version=?";
207 $sth_exists = $pgdbh->prepare($sql);
# Remote query object for fetching report-metadata batches from the
# CPAN Testers web API; counters for inserts and Pg timing stats.
209 my $query = CPAN::Testers::WWW::Reports::Query::Reports->new;
210 my($inscount) = 0;
211 my($pg_n,$pg_time) = (0,0);
# --- Main fetch/insert loop --------------------------------------------------
# Repeatedly ask the testers API for the next batch of reports (open-ended
# range "N-", or a single id with --queryid), insert new rows into Pg,
# mirror every record into a gzipped JSON logfile, then apply the various
# stop / sleep / throttle limits.
212 QUERY: while () {
213 my $range = $Opt{queryid} ? $nextid : "$nextid-";
214 my $batchstart = time;
215 warn sprintf "%s: Next query range '%s'\n", scalar gmtime(), $range;
216 my $result = $query->range($range);
# The API returns the result list either as a hash keyed by id or as an
# array; $RRL records which shape, and both are handled throughout.
217 my $RRL = ref $result->{list}; # Ref of ResultList
218 my $querycnt = $RRL eq "HASH" ? keys %{$result->{list}} : @{$result->{list}};
219 my $thismax = $querycnt > 0 ? ($RRL eq "HASH" ? (max(keys %{$result->{list}})) : $result->{list}[-1]{id}) : undef;
220 warn sprintf "%s: Got %d records from '%s' to '%s'\n", scalar gmtime(), $querycnt, $nextid, $thismax||"<UNDEF>";
# --maxins=0 means: only test the surroundings, then exit.
221 if (defined($Opt{maxins}) && $Opt{maxins} <= 0) {
222 last QUERY;
# Empty batch: give up when sleeping would overrun --maxtime (relative to
# process start time $^T), otherwise sleep and retry.
224 unless ($thismax){
225 if ($Opt{maxtime} && time+$Opt{sleeptime}-$^T >= $Opt{maxtime}) {
226 last QUERY;
227 } else {
228 sleep $Opt{sleeptime};
229 next QUERY;
233 # so we have some work to do
# Per-batch logfile .../YYYY/MM/YYYYMMDDTHHMM-<startid>-MAX.json.gz; the
# literal "MAX" placeholder is renamed to the highest id actually seen
# once the batch completes (see rename below).
234 my @gmtime = gmtime;
235 my $logfile = sprintf
237 "%s/var/refill-cpanstatsdb/%04d/%02d/%04d%02d%02dT%02d%02d-%d-MAX.json.gz",
238 $ENV{HOME},
239 1900+$gmtime[5],
240 1+$gmtime[4],
241 1900+$gmtime[5],
242 1+$gmtime[4],
243 @gmtime[3,2,1],
244 $nextid,
246 File::Path::mkpath File::Basename::dirname $logfile;
247 if (-e $logfile) {
248 die "ALERT: found '$logfile', will not overwrite it";
# Pipe through external gzip for the JSON mirror of this batch.
250 open my $fh, "|-", "gzip -9c > $logfile" or die "Could not open gzip to '$logfile': $!";
251 binmode($fh, ":utf8");
252 my $next_log = time + 60;
# Sample record shape, kept for reference:
253 # dist => "Attribute-Overload",
254 # fulldate => 201205262229,
255 # guid => "4454e538-a782-11e1-802a-3db30df65b4f",
256 # id => 22285792,
257 # osname => "linux",
258 # osvers => "2.6.18-1.2798.fc6",
259 # perl => "5.16.0 RC0",
260 # platform => "i686-linux-thread-multi-64int-ld",
261 # postdate => 201205,
262 # state => "fail",
263 # tester => "Khen1950fx\@aol.com",
264 # type => 2,
265 # version => "1.100710",
266 my $i = 0;
267 my $max_seen = 0;
# Iterate records in ascending id order (hash-shaped results get sorted).
268 REC: for my $rec ( $RRL eq "HASH" ? (sort {$a->{id} <=> $b->{id}} values %{$result->{list}}) : @{$result->{list}}) {
269 if (defined($Opt{maxins}) && $inscount >= $Opt{maxins}) {
270 last REC;
272 if ($Opt{maxtime} && time-$^T >= $Opt{maxtime}) {
273 last REC;
275 $max_seen = $rec->{id} if $rec->{id} > $max_seen;
276 my $record = $rec;
277 my $id = $record->{id};
# Only ids beyond the stored maximum are inserted into Pg; every record
# (old or new) is still mirrored to the JSON logfile below.
278 if ($id > $pgmaxid) {
279 my $start = time;
# Create the monthly partition on first sight of a new postdate.
# NOTE(review): Pg DDL generally does not accept bound parameters for
# partition bounds -- the '?' in this CREATE TABLE looks likely to fail
# at prepare time; confirm against a live run.
280 unless ($Partitions{$record->{postdate}}){ # create the partition
281 $pgdbh->do("CREATE TABLE cpanstats_$record->{postdate} PARTITION OF cpanstats FOR VALUES IN (?)",undef,$record->{postdate})
282 or die sprintf "Error while trying create cpan_%s: %s", $record->{postdate}, $pgdbh->errstr;
283 warn "INFO: partition cpanstats_$record->{postdate} created\n";
284 $Partitions{$record->{postdate}}++;
# Reformat fulldate YYYYMMDDHHMM -> "YYYY-MM-DDTHH:MMz".
286 $record->{fulldate} = sprintf "%04d-%02d-%02dT%02d:%02dz", unpack("a4a2a2a2a2", $record->{fulldate});
287 $pgsth->execute($id,@{$record}{qw(guid state postdate tester dist version platform perl osname osvers fulldate type)});
288 $pg_n++;
289 $pg_time += time - $start;
290 # oldestpostdate, copied from genoldestpostdate.pl:
# Insert-if-missing of the first-seen postdate, once per dist (with the
# empty-string version) and once per dist+version.
291 my($dist,$version,$postdate) = @{$record}{qw(dist version postdate)};
292 $sth_exists->execute($dist,"");
293 my($cnt0) = $sth_exists->fetchrow_array;
294 unless ($cnt0) {
295 $sth_ins->execute($dist,"",$postdate);
297 $sth_exists->execute($dist,$version);
298 my($cnt1) = $sth_exists->fetchrow_array;
299 unless ($cnt1) {
300 $sth_ins->execute($dist,$version,$postdate);
# Optionally remember every dist-version in a Redis set.
303 if ($Opt{"generate-redis-legalset"}) {
304 my $distv = "$record->{dist}-$record->{version}";
305 $redis->sadd("analysis:distv:legalset",$distv);
306 #### hincrby not supported by our ubuntu redis
307 #### if ($record->{state} eq "pass") {
308 #### $redis->hincrby("analysis:distv:pass",$distv,1);
309 #### } elsif ($record->{state} eq "fail") {
310 #### $redis->hincrby("analysis:distv:fail",$distv,1);
311 #### }
313 # ddx $record; # see also Data::Dump line
# Mirror the record to the JSON logfile; heartbeat at most once a minute.
314 print $fh $jsonxs->encode($record), "\n";
315 $i++;
316 if (time >= $next_log) {
317 warn sprintf "%s: %d records inserted\n", scalar gmtime(), $i;
318 $next_log += 60;
320 $inscount++;
322 close $fh or die "Could not close gzip to '$logfile': $!";
# Rename the logfile, substituting the MAX placeholder with the highest
# id seen (or $nextid-1 when the batch yielded nothing insertable).
323 my $finallogfile = $logfile;
324 unless ($max_seen) {
325 $max_seen = $nextid - 1;
327 $finallogfile =~ s/MAX/$max_seen/;
328 rename $logfile, $finallogfile or die "Could not rename $logfile, $finallogfile: $!";
# Stop conditions: single-id mode, --finishlimit undercut, --maxins hit.
329 if ($Opt{queryid}) {
330 last QUERY;
332 if ( $Opt{finishlimit} && $querycnt < $Opt{finishlimit}) {
333 last QUERY;
335 if (defined($Opt{maxins}) && $inscount >= $Opt{maxins}) {
336 last QUERY;
# Small batches trigger a sleep -- but never sleep past --maxtime.
338 my $sleeptime = 0;
339 if ( $Opt{sleeplimit} && $querycnt < $Opt{sleeplimit} ) {
340 $sleeptime = $Opt{sleeptime};
342 if ($Opt{maxtime} && time+$sleeptime-$^T >= $Opt{maxtime}) {
343 last QUERY;
345 if ($sleeptime) {
346 sleep $sleeptime;
# Throttle: pad each batch out to at least --throttletime seconds.
348 my $batchtook = time - $batchstart;
349 if ($Opt{throttletime} and $batchtook < $Opt{throttletime}) {
350 my $throttle_sleep = $Opt{throttletime} - $batchtook;
351 sleep $throttle_sleep;
353 $nextid = $thismax+1;
# Final timing statistics for the Pg insert path.
355 if ($pg_n) {
356 warn sprintf "STATS: pg_time %.3f, recs written %d, pg avg ins time per rec %.5f\n", $pg_time, $pg_n, $pg_time/$pg_n;
359 # for the record: today I added the two:
360 # CREATE INDEX ixdist ON cpanstats (dist); # took ca 30 minutes
361 # CREATE INDEX ixtypestate ON cpanstats (type, state);
362 # DROP INDEX ixvers;
364 # Local Variables:
365 # mode: cperl
366 # cperl-indent-level: 4
367 # End: