#!/usr/bin/perl

# use 5.010;
use strict;
use warnings;

=head1 NAME

refill-cpanstatsdb.pl - fetch new cpantesters reports and fill them into the local cpanstats table

=head1 SYNOPSIS

  ~/src/installed-perls/v5.16.0/4e6d/bin/perl bin/refill-cpanstatsdb.pl
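
A run with explicit limits (the option values here are only
illustrative) might be started as:

  ~/src/installed-perls/v5.16.0/4e6d/bin/perl bin/refill-cpanstatsdb.pl \
      --maxtime=3600 --maxins=5000 --sleeptime=300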

=head1 OPTIONS

=over 8

=cut
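
# The option specs for Getopt::Long are extracted from the documentation
# below: the heredoc runs up to the literal "=back" line, and each spec
# written as B<--spec> in it is captured into @opt.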
my @opt = <<'=back' =~ /B<--(\S+)>/g;

=item B<--help|h!>

This help

=item B<--finishlimit=i>

A query that yields a result with fewer rows than this number is the
signal to refrain from further refill queries and finish this program.
Defaults to 0, which means other limits are needed to stop this
program.

Note: before we invented the sleep parameters, this was how we stopped
the program. Probably not needed anymore.

=item B<--maxins=i>

Maximum number of records to insert. No default, which means no limit.
If set to zero, we only test the surroundings and then exit.

=item B<--maxtime=i>

Maximum time in seconds this program should run. Defaults to 1770. If
set to zero, no limit.

=item B<--queryid=i>

Normally the database is asked for its max(id) and the first query
then starts one above that with an open end. If --queryid is
specified, then we query that and only that id and then finish the
program.

=item B<--sleeplimit=i>

A query that yields a result with fewer rows than this number is the
signal to sleep for $Opt{sleeptime} seconds before querying again.
Defaults to 500. Do not set it too low, or it will produce an annoying
number of logfiles.

=item B<--sleeptime=i>

How long to sleep when a query result falls below $Opt{sleeplimit}.
Defaults to 150 seconds.

=back

=head1 DESCRIPTION

Replacement for the job that downloaded the whole cpanstats.db and
gunzipped it.

Now we simply fetch the descriptions of the next 2500 reports over and
over again until the supply dries out. Thus we reach a new maximum id,
write all the stuff to the db and let the other jobs work from there.
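
In outline, each round of the QUERY loop below boils down to this
simplified sketch (ignoring logging, the empty-result case, and the
various limits):

  my $result  = $query->range("$nextid-");  # open-ended batch starting at $nextid
  my $thismax = max(keys %$result);         # highest report id in this batch
  # insert the ids that are new to Pg, append every record as JSON to a gzipped logfile
  $nextid = $thismax + 1;                   # the next round starts after the new maximum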

=head1 TODO

Remove unneeded data, maybe split them out.

=head1 SEE ALSO

refill-cpanstatsdb-minutes.pl

=cut

use FindBin;
use lib "$FindBin::Bin/../CPAN-Blame/lib";
use CPAN::Blame::Config::Cnntp;

use Dumpvalue;
use File::Basename ();
use File::Path ();
use File::Spec;
use File::Temp;
use Getopt::Long;
use Pod::Usage;
use Hash::Util qw(lock_keys);
use List::Util qw(min);
use lib "$FindBin::Bin/../CPAN-Blame/lib";
use IPC::ConcurrencyLimit;
use Redis;

our %Opt;
lock_keys %Opt, map { /([^=|!]+)/ } @opt;
GetOptions(\%Opt,
           @opt,
          ) or pod2usage(1);
if ($Opt{help}) {
    pod2usage(0);
}
$Opt{finishlimit} ||= 0;
$Opt{sleeplimit} ||= 500;
$Opt{sleeptime} ||= 150;
$Opt{maxtime} = 1770 unless defined $Opt{maxtime};

my($workdir);
BEGIN {
    $workdir = File::Spec->catdir
        ($CPAN::Blame::Config::Cnntp::Config->{solver_vardir},
         "workdir");
}
my($basename) = File::Basename::basename(__FILE__);
my $limit = IPC::ConcurrencyLimit->new
    (
     max_procs => 1,
     path => "$workdir/IPC-ConcurrencyLimit-$basename",
    );
my $limitid = $limit->get_lock;
if (not $limitid) {
    warn "Another process appears to be still running. Exiting.";
    exit(0);
}

use DBI;
use Time::HiRes qw(time);
use JSON::XS ();
use List::Util qw(max);
use CPAN::Testers::WWW::Reports::Query::Reports;

our $jsonxs = JSON::XS->new->indent(0);
our $redis = Redis->new(reconnect => 120, every => 1000);
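# $jsonxs writes one compact (indent(0)) JSON record per line into the
# gzipped logfile; $redis is used to remember every dist-version seen.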

my($pgdbh,$pgsth,$pgmaxid,$nextid);
$pgdbh = DBI->connect("dbi:Pg:dbname=analysis") or die "Could not connect to 'analysis': $DBI::err";
if ($Opt{queryid}) {
    $pgmaxid = 0;
    $nextid = $Opt{queryid};
} else {
    my $sql = "select max(id) from cpanstats";
    $pgsth = $pgdbh->prepare($sql);

    my $rv = eval { $pgsth->execute(); };
    unless ($rv) {
        my $err = $pgsth->errstr;
        die "Warning: error occurred while executing '$sql': $err";
    }

    my(@row) = $pgsth->fetchrow_array();
    $pgmaxid = $row[0];
    warn "INFO: In Pg found max id '$pgmaxid'";
    $nextid = $pgmaxid+1;
}

my $sql = "INSERT INTO cpanstats
 (id,guid,state,postdate,tester,dist,version,platform,perl,osname,osvers,fulldate,type) values
 (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
$pgsth = $pgdbh->prepare($sql);

my $query = CPAN::Testers::WWW::Reports::Query::Reports->new;
my($inscount) = 0;
my($pg_n,$pg_time) = (0,0);
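
# Main loop: fetch batches of reports starting at $nextid; when nothing new
# arrives, sleep and retry, and stop once one of the limits (--maxins,
# --maxtime, --finishlimit) is hit or the single --queryid query is done.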
QUERY: while () {
    my $range = $Opt{queryid} ? $nextid : "$nextid-";
    warn sprintf "%s: Next query range '%s'\n", scalar gmtime(), $range;
    my $result = $query->range($range);
    my $querycnt = keys %$result;
    my $thismax = $querycnt > 0 ? max(keys %$result) : undef;
    warn sprintf "%s: Got %d records from '%s' to '%s'\n", scalar gmtime(), $querycnt, $nextid, $thismax||"<UNDEF>";
    if (defined($Opt{maxins}) && $Opt{maxins} <= 0) {
        last QUERY;
    }
    unless ($thismax){
        if ($Opt{maxtime} && time+$Opt{sleeptime}-$^T >= $Opt{maxtime}) {
            last QUERY;
        } else {
            sleep $Opt{sleeptime};
            next QUERY;
        }
    }

    # so we have some work to do
    my @gmtime = gmtime;
    my $logfile = sprintf
        (
         "%s/var/refill-cpanstatsdb/%04d/%02d/%04d%02d%02dT%02d%02d-%d-MAX.json.gz",
         $ENV{HOME},
         1900+$gmtime[5],
         1+$gmtime[4],
         1900+$gmtime[5],
         1+$gmtime[4],
         @gmtime[3,2,1],
         $nextid,
        );
    File::Path::mkpath File::Basename::dirname $logfile;
    if (-e $logfile) {
        die "ALERT: found '$logfile', will not overwrite it";
    }
    open my $fh, "|-", "gzip -9c > $logfile" or die "Could not open gzip to '$logfile': $!";
    my $next_log = time + 60;
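    # a single record as returned by the range query looks like this: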
    # dist => "Attribute-Overload",
    # fulldate => 201205262229,
    # guid => "4454e538-a782-11e1-802a-3db30df65b4f",
    # id => 22285792,
    # osname => "linux",
    # osvers => "2.6.18-1.2798.fc6",
    # perl => "5.16.0 RC0",
    # platform => "i686-linux-thread-multi-64int-ld",
    # postdate => 201205,
    # state => "fail",
    # tester => "Khen1950fx\@aol.com",
    # type => 2,
    # version => "1.100710",
    my $i = 0;
    my $max_seen;
    REC: for my $id (sort {$a <=> $b} keys %$result) {
        if (defined($Opt{maxins}) && $inscount >= $Opt{maxins}) {
            last REC;
        }
        if ($Opt{maxtime} && time-$^T >= $Opt{maxtime}) {
            last REC;
        }
        $max_seen = $id;
        my $record = $result->{$id};
        if ($id > $pgmaxid) {
            my $start = time;
            $pgsth->execute($id,@{$record}{qw(guid state postdate tester dist version platform perl osname osvers fulldate type)});
            $pg_n++;
            $pg_time += time - $start;
        }
        my $distv = "$record->{dist}-$record->{version}";
        $redis->sadd("analysis:distv:legalset",$distv);
        #### hincrby not supported by our ubuntu redis
        #### if ($record->{state} eq "pass") {
        ####     $redis->hincrby("analysis:distv:pass",$distv,1);
        #### } elsif ($record->{state} eq "fail") {
        ####     $redis->hincrby("analysis:distv:fail",$distv,1);
        #### }
        # ddx $record; # see also Data::Dump line
        print $fh $jsonxs->encode($record), "\n";
        $i++;
        if (time >= $next_log) {
            warn sprintf "%s: %d records inserted\n", scalar gmtime(), $i;
            $next_log += 60;
        }
        $inscount++;
    }
    close $fh or die "Could not close gzip to '$logfile': $!";
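    # swap the MAX placeholder in the logfile name for the highest report id
    # actually seen ($nextid-1 if no record was processed this round)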
    my $finallogfile = $logfile;
    unless ($max_seen) {
        $max_seen = $nextid - 1;
    }
    $finallogfile =~ s/MAX/$max_seen/;
    rename $logfile, $finallogfile or die "Could not rename $logfile, $finallogfile: $!";
    if ($Opt{queryid}) {
        last QUERY;
    }
    if ( $Opt{finishlimit} && $querycnt < $Opt{finishlimit}) {
        last QUERY;
    }
    if (defined($Opt{maxins}) && $inscount >= $Opt{maxins}) {
        last QUERY;
    }
    my $sleeptime = 0;
    if ( $Opt{sleeplimit} && $querycnt < $Opt{sleeplimit} ) {
        $sleeptime = $Opt{sleeptime};
    }
    if ($Opt{maxtime} && time+$sleeptime-$^T >= $Opt{maxtime}) {
        last QUERY;
    }
    if ($sleeptime) {
        sleep $sleeptime;
    }
    $nextid = $thismax+1;
}
if ($pg_n) {
    warn sprintf "STATS: pg avg ins time per rec %.5f\n", $pg_time/$pg_n;
}

# for the record: today I added these two:
# CREATE INDEX ixdist ON cpanstats (dist); # took ca 30 minutes
# CREATE INDEX ixtypestate ON cpanstats (type, state);
# DROP INDEX ixvers;

# Local Variables:
# mode: cperl
# cperl-indent-level: 4
# End: