fold duplicates into one line
[andk-cpan-tools.git] / bin / sync-sqlite-postgres.pl
blob548b1019b0cfded30087277a6d105b78bfcbc05d
#!/usr/bin/perl

# use 5.010;
use strict;
use warnings;

=head1 NAME

sync-sqlite-postgres.pl - copy the cpanstats table from SQLite to Postgres

=head1 SYNOPSIS

  sync-sqlite-postgres.pl [OPTIONS]

=head1 OPTIONS

=over 8

=cut

# Extract the Getopt::Long option specs from the POD that follows:
# every B<--...> occurrence between here and the "=back" heredoc
# terminator becomes one spec (e.g. "db=s", "help|h!", "max=i").
my @opt = <<'=back' =~ /B<--(\S+)>/g;

=item B<--db=s>

DSN. Defaults to C<dbi:SQLite:dbname=$workdir/cpanstats.db> where
$workdir comes from the configuration.

=item B<--help|h!>

This help

=item B<--max=i>

Defaults to 720000. Number of records to transfer.

=back

=head1 DESCRIPTION

Just the cpanstats table. We consider already written records correct
and do not repeat them.

2013-11-16 akoenig : without throttling we seem to copy 100 records
per second. That would mean for 36M records: more than 4 days.

After a first run for 360000 records, it turned out to be slower, this
run took 4800 seconds. The next run for 2M records did it in 21400
seconds, so we were faster again.

=cut
use FindBin;
use lib "$FindBin::Bin/../lib";
BEGIN {
    # Placeholder for extra include directories; currently adds nothing.
    push @INC, qw( );
}

use Dumpvalue;
use File::Basename qw(dirname);
use File::Path qw(mkpath);
use File::Spec;
use File::Temp;
use Getopt::Long;
use Pod::Usage;
use Hash::Util qw(lock_keys);

our %Opt;
# Restrict %Opt to the option names (the part of each spec before the
# first '=', '|', or '!') so that a typo'd key dies loudly instead of
# autovivifying silently.
lock_keys %Opt, map { /([^=|!]+)/ } @opt;
GetOptions(\%Opt,
           @opt,
          ) or pod2usage(1);
if ($Opt{help}) {
    pod2usage(0);
}

our $SIGNAL = 0;
# Support a clean shutdown: the copy loop below checks $SIGNAL after
# every row and exits the loop instead of dying mid-transfer.
$SIG{INT} = $SIG{TERM} = sub { my $sig = shift; warn "Caught $sig\n"; $SIGNAL=1 };
use DBI;
use Time::HiRes qw(sleep time);
use CPAN::Blame::Config::Cnntp;
my($workdir,$cpan_home,$ext_src);
BEGIN {
    # Resolve configured directories at compile time; $workdir is
    # needed below to build the default --db DSN.
    $workdir = File::Spec->catdir
        ($CPAN::Blame::Config::Cnntp::Config->{solver_vardir},
         "workdir");
    $cpan_home = $CPAN::Blame::Config::Cnntp::Config->{cpan_home};
    $ext_src = $CPAN::Blame::Config::Cnntp::Config->{ext_src};
}

# Defaults (see POD): --max caps the number of records per run,
# --db points at the SQLite source database.
$Opt{max} ||= 720000;
$Opt{db} ||= "dbi:SQLite:dbname=$workdir/cpanstats.db";
# Source (SQLite) and target (Postgres) handles. Check the connects:
# DBI->connect returns undef on failure, and carrying an undef handle
# forward would only crash later at prepare() with a confusing message.
my $sldbi = DBI->connect ($Opt{db})
    or die "Cannot connect to $Opt{db}: $DBI::errstr";
my $pgdbi = DBI->connect ("dbi:Pg:dbname=analysis")
    or die "Cannot connect to dbi:Pg:dbname=analysis: $DBI::errstr";

# The target table was created once with:
# $pgdbi->do("create table cpanstats (
# id INTEGER PRIMARY KEY,
# guid varchar(255),
# state varchar(255),
# postdate varchar(255),
# tester varchar(255),
# dist varchar(255),
# version varchar(255),
# platform varchar(255),
# perl varchar(255),
# osname varchar(255),
# osvers varchar(255),
# fulldate varchar(255),
# type INTEGER)
# ");

# Determine the resume point: every id at or below the current
# Postgres max(id) is considered already copied correctly.
my $slsth = $sldbi->prepare("SELECT max(id) from cpanstats");
$slsth->execute();
my($slmaxid) = $slsth->fetchrow_array();
my $pgsth = $pgdbi->prepare("SELECT max(id) from cpanstats");
$pgsth->execute();
my($pgmaxid) = $pgsth->fetchrow_array();
$pgmaxid ||= 0; # an empty target table yields NULL/undef
warn "slmaxid[$slmaxid] pgmaxid[$pgmaxid] opt-max[$Opt{max}]\n";
die "Nothing to copy" if $pgmaxid >= $slmaxid;

# Stream at most $Opt{max} not-yet-copied rows in id order and insert
# them one by one with a reusable prepared statement.
$slsth = $sldbi->prepare("SELECT id,guid,state,postdate,tester,dist,version,platform,perl,osname,osvers,fulldate,type from cpanstats where id > ? order by id limit $Opt{max}");
$slsth->execute($pgmaxid);
$pgsth = $pgdbi->prepare("INSERT INTO cpanstats
 (id,guid,state,postdate,tester,dist,version,platform,perl,osname,osvers,fulldate,type) values
 (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
my $i = 0;
my $lastreport = my $starttime = time;
my $report_every = 1800; # seconds between progress reports

ROW: while (my(@row) = $slsth->fetchrow_array) {
    unless ($pgsth->execute(@row)) {
        # Stop at the first failed insert rather than risk holes in
        # the id sequence the resume logic depends on.
        warn sprintf "ALERT: error inserting row/id[%s]: %s\n", $row[0], $pgdbi->errstr;
        last ROW;
    }
    ++$i;
    last ROW if $SIGNAL;       # honor SIGINT/SIGTERM between rows
    sleep 0.01 unless $i%1000; # light throttle every 1000 rows
    if ( time - $lastreport > $report_every ) {
        warn sprintf "Running for %.1f hours, reached i[%d]\n", (time-$^T)/3600, $i;
        $lastreport = time;
    }
}

my $tooktime = time - $starttime;
warn "records transferred[$i] tooktime[$tooktime]\n";
# Local Variables:
# mode: cperl
# cperl-indent-level: 4
# End: