remove debugging noise
[rersyncrecent.git] / lib / File / Rsync / Mirror / Recent.pm
blobe48d2662292e88c814218c91e68f23010badd8c0
1 package File::Rsync::Mirror::Recent;
3 # use warnings;
4 use strict;
5 use File::Rsync::Mirror::Recentfile;
7 =encoding utf-8
9 =head1 NAME
11 File::Rsync::Mirror::Recent - mirroring via rsync made efficient
13 =cut
15 package File::Rsync::Mirror::Recent;
17 use File::Basename qw(basename dirname fileparse);
18 use File::Copy qw(cp);
19 use File::Path qw(mkpath);
20 use File::Rsync;
21 use File::Rsync::Mirror::Recentfile::Done (); # at least needed by thaw()
22 use File::Rsync::Mirror::Recentfile::FakeBigFloat qw(:all);
23 use File::Temp;
24 use List::Pairwise qw(mapp grepp);
25 use List::Util qw(first max);
26 use Scalar::Util qw(blessed reftype);
27 use Storable;
28 use Time::HiRes qw();
29 use YAML::Syck;
31 use version; our $VERSION = qv('0.0.8');
33 =head1 SYNOPSIS
35 The documentation in here is normally not needed because the code is
36 considered to be run from several standalone programs. For a quick
37 overview, see the file README.mirrorcpan and the bin/ directory of the
38 distribution. For the architectural ideas see the section THE
39 ARCHITECTURE OF A COLLECTION OF RECENTFILES below.
41 File::Rsync::Mirror::Recent establishes a view on a collection of
42 File::Rsync::Mirror::Recentfile objects and provides abstractions
43 spanning multiple time intervals associated with those.
45 =head1 EXPORT
47 No exports.
49 =head1 CONSTRUCTORS
51 =head2 my $obj = CLASS->new(%hash)
53 Constructor. On every argument pair the key is a method name and the
54 value is an argument to that method name.
56 =cut
58 sub new {
59 my($class, @args) = @_;
60 my $self = bless {}, $class;
61 while (@args) {
62 my($method,$arg) = splice @args, 0, 2;
63 $self->$method($arg);
65 return $self;
68 =head2 my $obj = CLASS->thaw($statusfile)
70 Constructor from a statusfile left over from a previous
71 rmirror run. See also C<runstatusfile>.
73 =cut
75 sub _thaw_without_pathdb {
76 my($self,$file) = @_;
77 open my $fh, $file or die "Can't open '$file': $!";
78 local $/ = "\n";
79 my $in_pathdb = 0;
80 my $tfile = File::Temp->new
82 TEMPLATE => "Recent-thaw-XXXX",
83 TMPDIR => 1,
84 UNLINK => 0,
85 CLEANUP => 0,
86 SUFFIX => '.dat',
88 my $template_for_eop;
89 while (<$fh>) {
90 if ($in_pathdb) {
91 if (/$template_for_eop/) {
92 $in_pathdb = 0;
94 } elsif (/(\s+)-\s*__pathdb\s*:/) {
95 $in_pathdb = 1;
96 my $next_attr = sprintf "^%s\\S", " ?" x length($1);
97 $template_for_eop = qr{$next_attr};
99 print $tfile $_ unless $in_pathdb;
101 close $tfile or die "Could not close: $!";
102 my $return = $self->thaw($tfile->filename);
103 $return->_havelostpathdb(1);
104 unlink $tfile->filename;
105 return $return;
107 sub thaw {
108 my($self, $file) = @_;
109 die "thaw called without statusfile argument" unless defined $file;
110 unless (-e $file){
111 require Carp;
112 Carp::confess("Alert: statusfile '$file' not found");
114 require YAML::Syck;
115 my $start = time;
116 my $sleeptime = 0.02;
117 while (not mkdir "$file.lock") {
118 my $err = $!;
119 Time::HiRes::sleep $sleeptime;
120 my $waiting = time - $start;
121 if ($waiting >= 3){
122 warn "*** waiting ($waiting) for lock ($err) ***";
123 $sleeptime = 1;
126 my $size = -s $file;
127 my $serialized = YAML::Syck::LoadFile($file);
128 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
129 my $charged_self = $serialized->{reduced_self};
130 my $class = blessed $self;
131 bless $charged_self, $class;
132 my $rfs = $serialized->{reduced_rfs};
133 my $rfclass = $class . "file"; # "Recent" . "file"
134 my $pathdb = $charged_self->_pathdb;
135 for my $rf (@$rfs) {
136 bless $rf, $rfclass;
137 $rf->_pathdb($pathdb);
139 $charged_self->_recentfiles($rfs);
140 $charged_self->_principal_recentfile($rfs->[0]);
141 # die "FIXME: thaw all recentfiles from reduced_rfs into _recentfiles as well, watch out for pathdb and rsync";
142 return $charged_self;
145 =head1 ACCESSORS
147 =cut
149 my @accessors;
151 BEGIN {
152 @accessors =
154 "__pathdb",
155 "_dirtymark", # keeps track of the dirtymark of the recentfiles
156 "_havelostpathdb", # boolean
157 "_have_written_statusfile", # boolean
158 "_logfilefordone", # turns on _logfile on all DONE
159 # systems (disk intensive)
160 "_max_one_state", # when we have no time left but want
161 # at least get one file per
162 # iteration to avoid procrastination
163 "_principal_recentfile",
164 "_recentfiles",
165 "_rsync",
166 "_runstatusfile", # occasionally dumps all rfs
167 "_verbose", # internal variable for verbose setter/getter
168 "_verboselog", # internal variable for verboselog setter/getter
171 my @pod_lines =
172 split /\n/, <<'=cut'; push @accessors, grep {s/^=item\s+//} @pod_lines; }
174 =over 4
176 =item ignore_link_stat_errors
178 as in F:R:M:Recentfile
180 =item local
182 Option to specify the local principal file for operations with a local
183 collection of recentfiles.
185 =item localroot
187 as in F:R:M:Recentfile
189 =item max_files_per_connection
191 as in F:R:M:Recentfile
193 =item remote
195 The remote principal recentfile in rsync notation. E.g.
197 pause.perl.org::authors/RECENT.recent
199 =item remoteroot
201 as in F:R:M:Recentfile
203 =item remote_recentfile
205 Rsync address of the remote C<RECENT.recent> symlink or whichever name
206 the principal remote recentfile has.
208 =item rsync_options
210 Things like compress, links, times or checksums. Passed in to the
211 File::Rsync object used to run the mirror.
213 =item tempdir
215 as in F:R:M:Recentfile
217 =item ttl
219 Minimum time before fetching the principal recentfile again.
221 =back
223 =cut
225 use accessors @accessors;
227 =head1 METHODS
229 =head2 $arrayref = $obj->news ( %options )
231 Test this with:
233 perl -Ilib bin/rrr-news \
234 -after 1217200539 \
235 -max 12 \
236 -local /home/ftp/pub/PAUSE/authors/RECENT.recent
238 perl -Ilib bin/rrr-news \
239 -after 1217200539 \
240 -rsync=compress=1 \
241 -rsync=links=1 \
242 -localroot /home/ftp/pub/PAUSE/authors/ \
243 -remote pause.perl.org::authors/RECENT.recent
244 -verbose
246 Note: all parameters that can be passed to recent_events can also be specified here.
248 Note: all data are kept in memory
250 =cut
252 sub news {
253 my($self, %opt) = @_;
254 my $local = $self->local;
255 unless ($local) {
256 if (my $remote = $self->remote) {
257 my $localroot;
258 if ($localroot = $self->localroot) {
259 # nice, they know what they are doing
260 } else {
261 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
263 } else {
264 die "Alert: neither local nor remote specified, cannot continue";
267 my $rfs = $self->recentfiles;
268 my $ret = [];
269 my $before;
270 for my $rf (@$rfs) {
271 my %locopt = %opt;
272 $locopt{before} = $before;
273 if ($opt{max}) {
274 $locopt{max} -= scalar @$ret;
275 last if $locopt{max} <= 0;
277 $locopt{info} = {};
278 my $res = $rf->recent_events(%locopt);
279 if (@$res){
280 push @$ret, @$res;
282 if ($opt{max} && scalar @$ret > $opt{max}) {
283 last;
285 if ($opt{after}){
286 if ( $locopt{info}{last} && _bigfloatlt($locopt{info}{last}{epoch},$opt{after}) ) {
287 last;
289 if ( _bigfloatgt($opt{after},$locopt{info}{first}{epoch}) ) {
290 last;
293 if (!@$res){
294 next;
296 $before = $res->[-1]{epoch};
297 $before = $opt{before} if $opt{before} && _bigfloatlt($opt{before},$before);
299 $ret;
302 =head2 overview ( %options )
304 returns a small table that summarizes the state of all recentfiles
305 collected in this Recent object.
307 $options{verbose}=1 increases the number of columns displayed.
309 Here is an example output:
311 Ival Cnt Max Min Span Util Cloud
312 1h 47 1225053014.38 1225049650.91 3363.47 93.4% ^ ^
313 6h 324 1225052939.66 1225033394.84 19544.82 90.5% ^ ^
314 1d 437 1225049651.53 1224966402.53 83248.99 96.4% ^ ^
315 1W 1585 1225039015.75 1224435339.46 603676.29 99.8% ^ ^
316 1M 5855 1225017376.65 1222428503.57 2588873.08 99.9% ^ ^
317 1Q 17066 1224578930.40 1216803512.90 7775417.50 100.0% ^ ^
318 1Y 15901 1223966162.56 1216766820.67 7199341.89 22.8% ^ ^
319 Z 9909 1223966162.56 1216766820.67 7199341.89 - ^ ^
321 I<Max> is the name of the interval.
323 I<Cnt> is the number of entries in this recentfile.
325 I<Max> is the highest(first) epoch in this recentfile, rounded.
327 I<Min> is the lowest(last) epoch in this recentfile, rounded.
329 I<Span> is the timespan currently covered, rounded.
331 I<Util> is I<Span> devided by the designated timespan of this
332 recentfile.
334 I<Cloud> is ascii art illustrating the sequence of the Max and Min
335 timestamps.
337 =cut
338 sub overview {
339 my($self,%options) = @_;
340 my $rfs = $self->recentfiles;
341 my(@s,%rank);
342 RECENTFILE: for my $rf (@$rfs) {
343 my $re=$rf->recent_events;
344 my $rfsummary;
345 if (@$re) {
346 my $span = $re->[0]{epoch}-$re->[-1]{epoch};
347 my $merged = $rf->merged;
348 $rfsummary =
350 "Ival",
351 $rf->interval,
352 "Cnt",
353 scalar @$re,
354 "Dirtymark",
355 $rf->dirtymark ? sprintf("%.2f",$rf->dirtymark) : "-",
356 "Produced",
357 sprintf ("%.2f", $rf->{ORIG}{Producers}{time}||0),
358 "Merged",
359 ($rf->interval eq "Z"
363 sprintf ("%.2f", $merged->{epoch} || 0)),
364 "Max",
365 sprintf ("%.2f", $re->[0]{epoch}),
366 "Min",
367 sprintf ("%.2f", $re->[-1]{epoch}),
368 "Span",
369 sprintf ("%.2f", $span),
370 "Util", # u9n:)
371 ($rf->interval eq "Z"
375 sprintf ("%5.1f%%", 100 * $span / $rf->interval_secs)
378 @rank{mapp {$b} grepp {$a =~ /^(Max|Min)$/} @$rfsummary} = ();
379 } else {
380 next RECENTFILE;
382 push @s, $rfsummary;
384 @rank{sort {$b <=> $a} keys %rank} = 1..keys %rank;
385 my $maxrank = max values %rank;
386 for my $rfsummary (@s) {
387 my $string = " " x $maxrank;
388 my @borders;
389 for my $ele (qw(Max Min)) {
390 my($r) = mapp {$b} grepp {$a eq $ele} @$rfsummary;
391 push @borders, $rank{$r}-1;
393 for ($borders[0],$borders[1]) {
394 substr($string,$_,1) = "^";
396 push @$rfsummary, "Cloud", $string;
398 unless ($options{verbose}) {
399 my %filter = map {($_=>1)} qw(Ival Cnt Max Min Span Util Cloud);
400 for (@s) {
401 $_ = [mapp {($a,$b)} grepp {!!$filter{$a}} @$_];
404 my @sprintf;
405 for (my $i = 0; $i <= $#{$s[0]}; $i+=2) {
406 my $maxlength = max ((map { length $_->[$i+1] } @s), length $s[0][$i]);
407 push @sprintf, "%" . $maxlength . "s";
409 my $sprintf = join " ", @sprintf;
410 $sprintf .= "\n";
411 my $headline = sprintf $sprintf, mapp {$a} @{$s[0]};
412 join "", $headline, map { sprintf $sprintf, mapp {$b} @$_ } @s;
415 =head2 _pathdb
417 Keeping track of already handled files. Currently it is a hash, will
418 probably become a database with its own accessors.
420 =cut
422 sub _pathdb {
423 my($self, $set) = @_;
424 if ($set) {
425 $self->__pathdb ($set);
427 my $pathdb = $self->__pathdb;
428 unless (defined $pathdb) {
429 $self->__pathdb(+{});
431 return $self->__pathdb;
434 =head2 $recentfile = $obj->principal_recentfile ()
436 returns the principal recentfile object of this tree.
438 =cut
439 # mirrors the recentfile and instantiates the recentfile object
440 sub _principal_recentfile_fromremote {
441 my($self) = @_;
442 # get the remote recentfile
443 my $rrfile = $self->remote or die "Alert: cannot construct a recentfile object without the 'remote' attribute";
444 my $splitter = qr{(.+)/([^/]*)};
445 my($remoteroot,$rfilename) = $rrfile =~ $splitter;
446 $self->remoteroot($remoteroot);
447 my($abslfile, $fh);
448 if (!defined $rfilename) {
449 die "Alert: Cannot resolve '$rrfile', does not match $splitter";
450 } elsif (not length $rfilename or $rfilename eq "RECENT.recent") {
451 ($abslfile,$rfilename,$fh) = $self->_principal_recentfile_fromremote_resosymlink($rfilename);
453 my @need_args =
455 "ignore_link_stat_errors",
456 "localroot",
457 "max_files_per_connection",
458 "remoteroot",
459 "rsync_options",
460 "tempdir",
461 "ttl",
462 "verbose",
463 "verboselog",
465 my $rf0;
466 unless ($abslfile) {
467 $rf0 = File::Rsync::Mirror::Recentfile->new (map {($_ => $self->$_)} @need_args);
468 $rf0->split_rfilename($rfilename);
469 $abslfile = $rf0->get_remote_recentfile_as_tempfile ();
471 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ( $abslfile );
472 $rf0->_current_tempfile ( $abslfile );
473 $rf0->_current_tempfile_fh ( $fh );
474 $rf0->_use_tempfile (1);
475 for my $override (@need_args) {
476 $rf0->$override ( $self->$override );
478 $rf0->is_slave (1);
479 return $rf0;
481 sub principal_recentfile {
482 my($self) = @_;
483 my $rf0 = $self->_principal_recentfile;
484 return $rf0 if defined $rf0;
485 my $local = $self->local;
486 if ($local) {
487 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ($local);
488 } else {
489 if (my $remote = $self->remote) {
490 my $localroot;
491 if ($localroot = $self->localroot) {
492 # nice, they know what they are doing
493 } else {
494 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
496 $rf0 = $self->_principal_recentfile_fromremote;
497 } else {
498 die "Alert: neither local nor remote specified, cannot continue";
501 $self->_principal_recentfile($rf0);
502 return $rf0;
505 =head2 $recentfiles_arrayref = $obj->recentfiles ()
507 returns a reference to the complete list of recentfile objects that
508 describe this tree. No guarantee is given that the represented
509 recentfiles exist or have been read. They are just bare objects.
511 =cut
513 sub recentfiles {
514 my($self) = @_;
515 my $rfs = $self->_recentfiles;
516 return $rfs if defined $rfs;
517 my $rf0 = $self->principal_recentfile;
518 my $pathdb = $self->_pathdb;
519 $rf0->_pathdb ($pathdb);
520 my $aggregator = $rf0->aggregator;
521 my @rf = $rf0;
522 for my $agg (@$aggregator) {
523 my $nrf = $rf0->_sparse_clone;
524 $nrf->interval ( $agg );
525 $nrf->have_mirrored ( 0 );
526 $nrf->_pathdb ( $pathdb );
527 push @rf, $nrf;
529 $self->_recentfiles(\@rf);
530 return \@rf;
533 =head2 $success = $obj->rmirror ( %options )
535 Mirrors all recentfiles of the I<remote> address working through all
536 of them, mirroring their contents.
538 Test this with:
540 use File::Rsync::Mirror::Recent;
541 my $rrr = File::Rsync::Mirror::Recent->new(
542 ignore_link_stat_errors => 1,
543 localroot => "/home/ftp/pub/PAUSE/authors",
544 remote => "pause.perl.org::authors/RECENT.recent",
545 max_files_per_connection => 5000,
546 rsync_options => {
547 compress => 1,
548 links => 1,
549 times => 1,
550 checksum => 0,
552 verbose => 1,
553 _runstatusfile => "recent-rmirror-state.yml",
554 _logfilefordone => "recent-rmirror-donelog.log",
556 $rrr->rmirror ( "skip-deletes" => 1, loop => 1 );
558 Or try without the loop parameter and write the loop yourself:
560 use File::Rsync::Mirror::Recent;
561 my @rrr;
562 for my $t ("authors","modules"){
563 my $rrr = File::Rsync::Mirror::Recent->new(
564 ignore_link_stat_errors => 1,
565 localroot => "/home/ftp/pub/PAUSE/$t",
566 remote => "pause.perl.org::$t/RECENT.recent",
567 max_files_per_connection => 512,
568 rsync_options => {
569 compress => 1,
570 links => 1,
571 times => 1,
572 checksum => 0,
574 verbose => 1,
575 _runstatusfile => "recent-rmirror-state-$t.yml",
576 _logfilefordone => "recent-rmirror-donelog-$t.log",
577 ttl => 5,
579 push @rrr, $rrr;
581 while (){
582 for my $rrr (@rrr){
583 $rrr->rmirror ( "skip-deletes" => 1 );
585 warn "sleeping 23\n"; sleep 23;
589 =cut
590 # _alluptodate is unused but at least it worked last time I needed it,
591 # so let us keep it around
592 sub _alluptodate {
593 my($self) = @_;
594 my $sdm = $self->_dirtymark;
595 return unless defined $sdm;
596 for my $rf (@{$self->recentfiles}) {
597 return if $rf->seeded;
598 my $rfdm = $rf->dirtymark;
599 return unless defined $rfdm;
600 return unless $rfdm eq $sdm;
601 my $done = $rf->done;
602 return unless defined $done;
603 my $done_intervals = $done->_intervals;
604 return if !defined $done_intervals;
605 # nonono, may be more than one, only covered it must be:
606 # return if @$done_intervals > 1;
607 my $minmax = $rf->minmax;
608 return unless defined $minmax;
609 return unless $done->covered(@$minmax{qw(max min)});
611 # $DB::single++;
612 return 1;
614 sub _fullseed {
615 my($self) = @_;
616 for ( @{$self->recentfiles} ) { $_->seed(1) }
618 sub rmirror {
619 my($self, %options) = @_;
621 my $rfs = $self->recentfiles;
623 $self->principal_recentfile->seed;
624 my $_sigint = sub {
625 # XXX exit gracefully (reminder)
628 # XXX needs accessor: warning, if set too low, we do nothing but
629 # mirror the principal!
630 my $minimum_time_per_loop = 20;
632 if (my $logfile = $self->_logfilefordone) {
633 for my $i (0..$#$rfs) {
634 $rfs->[$i]->done->_logfile($logfile);
637 if (my $dirtymark = $self->principal_recentfile->dirtymark) {
638 my $mydm = $self->_dirtymark;
639 if (!defined $mydm){
640 $self->_dirtymark($dirtymark);
641 } elsif ($dirtymark ne $mydm) {
642 if ($self->verbose) {
643 my $fh;
644 if (my $vl = $self->verboselog) {
645 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
646 } else {
647 $fh = \*STDERR;
649 print $fh "NewDirtymark: old[$mydm] new[$dirtymark]\n";
651 $self->_dirtymark($dirtymark);
654 my $rstfile = $self->runstatusfile;
655 unless ($self->_have_written_statusfile) {
656 $self->_rmirror_runstatusfile_write ($rstfile, \%options);
657 $self->_have_written_statusfile(1);
659 $self->_rmirror_loop($minimum_time_per_loop,\%options);
662 sub _rmirror_loop {
663 my($self,$minimum_time_per_loop,$options) = @_;
664 LOOP: while () {
665 my $ttleave = time + $minimum_time_per_loop;
666 my $rstfile = $self->runstatusfile;
667 my $otherproc = $self->_thaw_without_pathdb ($rstfile);
668 my $pid = fork;
669 if (! defined $pid) {
670 warn "Contention: $!";
671 sleep 0.25;
672 next LOOP;
673 } elsif ($pid) {
674 waitpid($pid,0);
675 } else {
676 $self = $self->thaw ($rstfile);
677 my $rfs = $self->recentfiles;
678 $self->principal_recentfile->seed;
679 RECENTFILE: for my $i (0..$#$rfs) {
680 my $rf = $rfs->[$i];
681 if (time > $ttleave) {
682 # Must make sure that one file can get fetched in any case
683 $self->_max_one_state(1);
685 if ($rf->seeded) {
686 $self->_rmirror_mirror ($i, $options);
687 } elsif ($rf->uptodate) {
688 if ($i < $#$rfs) {
689 $rfs->[$i+1]->done->merge($rf->done);
691 # no further seed necessary because "periodic" does it
692 next RECENTFILE;
694 WORKUNIT: while (time < $ttleave) {
695 if ($rf->uptodate) {
696 $self->_rmirror_sleep_per_connection ($i);
697 next RECENTFILE;
698 } else {
699 $self->_rmirror_mirror ($i, $options);
702 if ($self->_max_one_state) {
703 last RECENTFILE;
706 $self->_max_one_state(0);
707 my $exit = 0;
708 if ($rfs->[-1]->uptodate) {
709 $self->_rmirror_cleanup;
711 unless ($options->{loop}) {
712 $exit = 1;
714 $self->_rmirror_runstatusfile_write ($rstfile, $options);
715 exit if $exit;
716 last LOOP;
719 $otherproc = $self->_thaw_without_pathdb ($rstfile);
720 if (!$options->{loop} && $otherproc && $otherproc->recentfiles->[-1]->uptodate) {
721 last LOOP;
723 my $sleep = $ttleave - time;
724 if ($sleep > 0.01) {
725 $self->_rmirror_endofloop_sleep ($sleep);
726 } else {
727 # negative time not invented yet:)
732 sub _rmirror_mirror {
733 my($self, $i, $options) = @_;
734 my $rfs = $self->recentfiles;
735 my $rf = $rfs->[$i];
736 my %locopt = %$options;
737 if ($self->_max_one_state) {
738 $locopt{max} = 1;
740 $locopt{piecemeal} = 1;
741 $rf->mirror (%locopt);
742 if ($i==0) {
743 # we limit to 0 for the case that upstream is broken and has
744 # more than one timestamp (happened on PAUSE 200903)
745 if (my $dirtymark = $rf->dirtymark) {
746 my $mydm = $self->_dirtymark;
747 if (!defined $mydm or $dirtymark ne $mydm) {
748 $self->_dirtymark($dirtymark);
749 $self->_fullseed;
755 sub _rmirror_sleep_per_connection {
756 my($self, $i) = @_;
757 my $rfs = $self->recentfiles;
758 my $rf = $rfs->[$i];
759 my $sleep = $rf->sleep_per_connection;
760 $sleep = 0.42 unless defined $sleep;
761 Time::HiRes::sleep $sleep;
762 $rfs->[$i+1]->done->merge($rf->done) if $i < $#$rfs;
765 sub _rmirror_cleanup {
766 my($self) = @_;
767 my $pathdb = $self->_pathdb();
768 for my $k (keys %$pathdb) {
769 delete $pathdb->{$k};
771 my $rfs = $self->recentfiles;
772 for my $i (0..$#$rfs-1) {
773 my $thismerged = $rfs->[$i]->merged;
774 my $next = $rfs->[$i+1];
775 my $nextminmax = $next->minmax;
776 if (not defined $thismerged->{epoch} or _bigfloatlt($nextminmax->{max},$thismerged->{epoch})){
777 $next->seed;
782 =head2 $file = $obj->runstatusfile ($set)
784 Getter/setter for C<_runstatusfile> attribute. Defaults to a temporary
785 file created by C<File::Temp>. A status file is required for
786 C<rmirror> working. Since it may be interesting for debugging
787 purposes, you may want to specify a permanent file for this.
789 =cut
790 sub runstatusfile {
791 my($self,$set) = @_;
792 if (defined $set) {
793 $self->_runstatusfile ($set);
795 my $x = $self->_runstatusfile;
796 unless (defined $x) {
797 require File::Temp;
798 my $tfile = File::Temp->new
800 TEMPLATE => "Recent-XXXX",
801 TMPDIR => 1,
802 UNLINK => 0,
803 CLEANUP => 0,
804 SUFFIX => '.dat',
806 $self->_runstatusfile($tfile->filename);
808 return $self->_runstatusfile;
811 # unused code.... it was an oops, discovered the thaw() method too
812 # late, and starting writing this here....
813 sub _rmirror_runstatusfile_read {
814 my($self, $file) = @_;
816 require YAML::Syck;
817 my $start = time;
818 # XXX is locking useful here?
819 while (not mkdir "$file.lock") {
820 Time::HiRes::sleep 0.2;
821 warn "*** waiting for lock ***" if time - $start >= 3;
823 my $yml = YAML::Syck::LoadFile $file;
824 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
825 my $rself = $yml->{reduced_self};
826 my $rfs = $yml->{reduced_rfs};
827 # XXX bring them into self
830 sub _rmirror_runstatusfile_write {
831 my($self, $file, $options) = @_;
832 my $rself;
833 while (my($k,$v) = each %$self) {
834 next if $k =~ /^-(_principal_recentfile|_recentfiles)$/;
835 $rself->{$k} = $v;
837 my $rfs = $self->recentfiles;
838 my $rrfs;
839 for my $i (0..$#$rfs) {
840 my $rf = $rfs->[$i];
841 while (my($k,$v) = each %$rf) {
842 next if $k =~ /^-(_current_tempfile_fh|_pathdb|_rsync)$/;
843 $rrfs->[$i]{$k} = $rfs->[$i]{$k};
846 require YAML::Syck;
847 my $start = time;
848 while (not mkdir "$file.lock") {
849 Time::HiRes::sleep 0.15;
850 warn "*** waiting for lock ***" if time - $start >= 3;
852 YAML::Syck::DumpFile
854 "$file.new",
856 options => $options,
857 time => time,
858 reduced_rfs => $rrfs,
859 reduced_self => $rself,
861 rename "$file.new", $file or die "Could not rename: $!";
862 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
865 sub _rmirror_endofloop_sleep {
866 my($self, $sleep) = @_;
867 if ($self->verbose) {
868 my $fh;
869 if (my $vl = $self->verboselog) {
870 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
871 } else {
872 $fh = \*STDERR;
874 printf $fh
876 "Dorm %d (%s secs)\n",
877 time,
878 $sleep,
880 sleep $sleep;
884 # it returns two things: abslfile and rfilename. But the abslfile is
885 # undef when the rfilename ends in .recent. A weird interface, my
886 # friend.
887 sub _principal_recentfile_fromremote_resosymlink {
888 my($self, $rfilename) = @_;
889 $rfilename = "RECENT.recent" unless length $rfilename;
890 my $abslfile = undef;
891 my $fh;
892 if ($rfilename =~ /\.recent$/) {
893 # may be a file *or* a symlink,
894 ($abslfile,$fh) = $self->_fetch_as_tempfile ($rfilename);
895 while (-l $abslfile) {
896 my $symlink = readlink $abslfile;
897 if ($symlink =~ m|/|) {
898 die "FIXME: filenames containing '/' not supported, got '$symlink'";
900 my $localrfile = File::Spec->catfile($self->localroot, $rfilename);
901 if (-e $localrfile) {
902 my $old_symlink = readlink $localrfile;
903 if ($old_symlink eq $symlink) {
904 unlink $abslfile or die "Cannot unlink '$abslfile': $!";
905 } else {
906 unlink $localrfile; # may fail
907 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
909 } else {
910 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
912 ($abslfile,$fh) = $self->_fetch_as_tempfile ($symlink);
915 return ($abslfile, $rfilename, $fh);
918 # takes a basename, returns an absolute name, does not delete the
919 # file, throws the $fh away. Caller must rename or unlink
921 # XXX needs to activate the fh in the rf0 so that it is able to unlink
922 # the file. I would like that the file is used immediately by $rf0
923 sub _fetch_as_tempfile {
924 my($self, $rfile) = @_;
925 my($suffix) = $rfile =~ /(\.[^\.]+)$/;
926 $suffix = "" unless defined $suffix;
927 my $fh = File::Temp->new
928 (TEMPLATE => sprintf(".FRMRecent-%s-XXXX",
929 $rfile,
931 DIR => $self->tempdir || $self->localroot,
932 SUFFIX => $suffix,
933 UNLINK => 0,
935 my $rsync;
936 unless ($rsync = File::Rsync->new($self->rsync_options)) {
937 require Carp;
938 Carp::confess(YAML::Syck::Dump($self->rsync_options));
940 my $dst = $fh->filename;
941 local($ENV{LANG}) = "C";
942 $rsync->exec
944 src => join("/",$self->remoteroot,$rfile),
945 dst => $dst,
946 ) or die "Could not mirror '$rfile' to $fh\: ".join(" ",$rsync->err);
947 unless (-l $dst) {
948 my $mode = 0644;
949 chmod $mode, $dst or die "Could not chmod $mode '$dst': $!";
951 return($dst,$fh);
954 =head2 $verbose = $obj->verbose ( $set )
956 Getter/setter method to set verbosity for this F:R:M:Recent object and
957 all associated Recentfile objects.
959 =cut
960 sub verbose {
961 my($self,$set) = @_;
962 if (defined $set) {
963 for ( @{$self->recentfiles} ) { $_->verbose($set) }
964 $self->_verbose ($set);
966 my $x = $self->_verbose;
967 unless (defined $x) {
968 $x = 0;
969 $self->_verbose ($x);
971 return $x;
975 =head2 my $vl = $obj->verboselog ( $set )
977 Getter/setter method for the path to the logfile to write verbose
978 progress information to.
980 Note: This is a primitive stop gap solution to get simple verbose
981 logging working. The program still sends error messages to STDERR.
982 Switching to Log4perl or similar is probably the way to go. TBD.
984 =cut
985 sub verboselog {
986 my($self,$set) = @_;
987 if (defined $set) {
988 for ( @{$self->recentfiles} ) { $_->verboselog($set) }
989 $self->_verboselog ($set);
991 my $x = $self->_verboselog;
992 unless (defined $x) {
993 $x = 0;
994 $self->_verboselog ($x);
996 return $x;
999 =head1 THE ARCHITECTURE OF A COLLECTION OF RECENTFILES
1001 The idea is that we want to have a short file that records really
1002 recent changes. So that a fresh mirror can be kept fresh as long as
1003 the connectivity is given. Then we want longer files that record the
1004 history before. So when the mirror falls behind the update period
1005 reflected in the shortest file, it can complement the list of recent
1006 file events with the next one. And if this is not long enough we want
1007 another one, again a bit longer. And we want one that completes the
1008 history back to the oldest file. The index files do contain the
1009 complete list of current files. The longer a period covered by an
1010 index file is gone the less often the index file is updated. For
1011 practical reasons adjacent files will often overlap a bit but this is
1012 neither necessary nor enforced. That's the basic idea. The following
1013 example represents a tree that has a few updates every day:
1015 RECENT.recent -> RECENT-1h.yaml
1016 RECENT-1h.yaml
1017 RECENT-6h.yaml
1018 RECENT-1d.yaml
1019 RECENT-1M.yaml
1020 RECENT-1W.yaml
1021 RECENT-1Q.yaml
1022 RECENT-1Y.yaml
1023 RECENT-Z.yaml
1025 The first file is the principal file, in so far it is the one that is
1026 written first after a filesystem change. Usually a symlink links to it
1027 with a filename that has the same filenameroot and the suffix
1028 C<.recent>. On systems that do not support symlinks there is a plain
1029 copy maintained instead.
1031 The last file, the Z file, contains the complementary files that are
1032 in none of the other files. It may contain C<delete> events but often
1033 C<delete> events are discarded at the transition to the Z file.
1035 =head2 THE INDIVIDUAL RECENTFILE
1037 A I<recentfile> consists of a hash that has two keys: C<meta> and
1038 C<recent>. The C<meta> part has metadata and the C<recent> part has a
1039 list of fileobjects.
1041 =head2 THE META PART
1043 Here we find things that are pretty much self explaining: all
1044 lowercase attributes are accessors and as such explained in the
1045 manpages. The uppercase attribute C<Producers> contains version
1046 information about involved software components. Nothing to worry about
1047 as I believe.
1049 =head2 THE RECENT PART
1051 This is the interesting part. Every entry refers to some filesystem
1052 change (with path, epoch, type).
1054 The I<epoch> value is the point in time when some change was
1055 I<registered> but can be set to arbitrary values. Do not be tempted to
1056 believe that the entry has a direct relation to something like
1057 modification time or change time on the filesystem level. They are not
1058 reflecting release dates. (If you want exact release dates: Barbie is
1059 providing a database of them. See
1060 http://use.perl.org/~barbie/journal/37907).
1062 All these entries can be devided into two types (denoted by the
1063 I<type> attribute): C<new>s and C<delete>s. Changes and creations are
1064 C<new>s. Deletes are C<delete>s.
1066 Besides an I<epoch> and a I<type> attribute we find a third one:
1067 I<path>. This path is relative to the directory we find the
1068 I<recentfile> in.
1070 The order of the entries in the I<recentfile> is by decreasing epoch
1071 attribute. These are unique floating point numbers. When the server
1072 has ntp running correctly, then the timestamps are usually reflecting
1073 a real epoch. If time is running backwards, we trump the system epoch
1074 with strictly monotonically increasing floating point timestamps and
1075 guarantee they are unique.
1077 =head1 CORRUPTION AND RECOVERY
1079 If the origin host breaks the promise to deliver consistent and
1080 complete I<recentfiles> then it must update its C<dirtymark> and all
1081 slaves must discard what they cosider the truth. In the worst case
1082 that something goes wrong despite the dirtymark mechanism the way back
1083 to sanity can always be achieved through traditional rsyncing between
1084 the hosts.
1086 =head1 BACKGROUND
1088 This is about speeding up rsync operation on large trees. Uses a small
1089 metadata cocktail and pull technology.
1091 rersyncrecent solves this problem with a couple of (usually 2-10)
1092 lightweight index files which cover different overlapping time
1093 intervals. The master writes these files and the clients/slaves can
1094 construct the full tree from the information contained in them. The
1095 most recent index file usually covers the last seconds or minutes or
1096 hours of the tree and depending on the needs, slaves can rsync every
1097 few seconds or minutes and then bring their trees in full sync.
1099 The rersyncrecent model was developed for CPAN but as it is both
1100 convenient and economic it is also a general purpose solution. I'm
1101 looking forward to see a CPAN backbone that is only a few seconds
1102 behind PAUSE.
1104 =head2 NON-COMPETITORS
1106 File::Mirror JWU/File-Mirror/File-Mirror-0.10.tar.gz only local trees
1107 Mirror::YAML ADAMK/Mirror-YAML-0.03.tar.gz some sort of inner circle
1108 Net::DownloadMirror KNORR/Net-DownloadMirror-0.04.tar.gz FTP sites and stuff
1109 Net::MirrorDir KNORR/Net-MirrorDir-0.05.tar.gz dito
1110 Net::UploadMirror KNORR/Net-UploadMirror-0.06.tar.gz dito
1111 Pushmi::Mirror CLKAO/Pushmi-v1.0.0.tar.gz something SVK
1113 rsnapshot www.rsnapshot.org focus on backup
1114 csync www.csync.org more like unison
1115 multi-rsync sourceforge 167893 lan push to many
1116 chasm chasmd.org per-directory manifests
1118 =head2 COMPETITORS
1120 The problem to solve which clusters and ftp mirrors and otherwise
1121 replicated datasets like CPAN share: how to transfer only a minimum
1122 amount of data to determine the diff between two hosts.
1124 Normally it takes a long time to determine the diff itself before it
1125 can be transferred. Known solutions at the time of this writing are
1126 csync2, and rsync 3 batch mode.
1128 For many years the best solution was B<csync2> which solves the
1129 problem by maintaining a sqlite database on both ends and talking a
1130 highly sophisticated protocol to quickly determine which files to send
1131 and which to delete at any given point in time. Csync2 is often
1132 inconvenient because it is push technology and the act of syncing
1133 demands quite an intimate relationship between the sender and the
1134 receiver. This is hard to achieve in an environment of loosely coupled
1135 sites where the number of sites is large or connections are unreliable
1136 or network topology is changing.
1138 B<Rsync 3 batch mode> works around these problems by providing
1139 rsync-able batch files which allow receiving nodes to replay the
1140 history of the other nodes. This reduces the need to have an
1141 incestuous relation but it has the disadvantage that these batch files
1142 replicate the contents of the involved files. This seems inappropriate
1143 when the nodes already have a means of communicating over rsync.
1145 =head2 HONORABLE MENTION
1147 B<instantmirror> at https://fedorahosted.org/InstantMirror/ is an
1148 ambitious project that tries to combine various technologies (squid,
1149 bittorrent) to overcome the current slowness with the main focus on
1150 fedora. It's been founded in 2009-03 and at the time of this writing
1151 it is still a bit early to comment on.
1153 =head1 LIMITATIONS
1155 If the tree of the master server is changing faster than the bandwidth
1156 permits to mirror then additional protocols may need to be deployed.
1157 Certainly p2p/bittorrent can help in such situations because
1158 downloading sites help each other and bittorrent chunks large files
1159 into pieces.
1161 =head1 FUTURE DIRECTIONS
1163 Currently the origin server must keep track of injected and removed
1164 files. Should be supported by an inotify-based assistant.
1166 Convince other users outside the CPAN like
1167 http://fedoraproject.org/wiki/Infrastructure/Mirroring
1169 =head1 SEE ALSO
1171 L<File::Rsync::Mirror::Recentfile>,
1172 L<File::Rsync::Mirror::Recentfile::Done>,
1173 L<File::Rsync::Mirror::Recentfile::FakeBigFloat>
1175 =head1 BUGS
1177 Please report any bugs or feature requests through the web interface
1179 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=File-Rsync-Mirror-Recent>.
1180 I will be notified, and then you'll automatically be notified of
1181 progress on your bug as I make changes.
1183 =head1 SUPPORT
1185 You can find documentation for this module with the perldoc command.
1187 perldoc File::Rsync::Mirror::Recent
1189 You can also look for information at:
1191 =over 4
1193 =item * RT: CPAN's request tracker
1195 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=File-Rsync-Mirror-Recent>
1197 =item * AnnoCPAN: Annotated CPAN documentation
1199 L<http://annocpan.org/dist/File-Rsync-Mirror-Recent>
1201 =item * CPAN Ratings
1203 L<http://cpanratings.perl.org/d/File-Rsync-Mirror-Recent>
1205 =item * Search CPAN
1207 L<http://search.cpan.org/dist/File-Rsync-Mirror-Recent>
1209 =back
1212 =head1 ACKNOWLEDGEMENTS
1214 Thanks to RJBS for module-starter.
1216 =head1 AUTHOR
1218 Andreas König
1220 =head1 COPYRIGHT & LICENSE
1222 Copyright 2008, 2009 Andreas König.
1224 This program is free software; you can redistribute it and/or modify it
1225 under the same terms as Perl itself.
1228 =cut
1230 1; # End of File::Rsync::Mirror::Recent