up the version with perl-reversion
[rersyncrecent.git] / lib / File / Rsync / Mirror / Recent.pm
blob6429bb519b8470ec764ad4580f8dba555aaa663b
1 package File::Rsync::Mirror::Recent;
3 # use warnings;
4 use strict;
5 use File::Rsync::Mirror::Recentfile;
7 =encoding utf-8
9 =head1 NAME
11 File::Rsync::Mirror::Recent - mirroring via rsync made efficient
13 =cut
15 package File::Rsync::Mirror::Recent;
17 use File::Basename qw(basename dirname fileparse);
18 use File::Copy qw(cp);
19 use File::Path qw(mkpath);
20 use File::Rsync;
21 use File::Rsync::Mirror::Recentfile::FakeBigFloat qw(:all);
22 use File::Temp;
23 use List::Pairwise qw(mapp grepp);
24 use List::Util qw(first max);
25 use Scalar::Util qw(reftype);
26 use Storable;
27 use Time::HiRes qw();
28 use YAML::Syck;
30 use version; our $VERSION = qv('0.0.6');
32 =head1 SYNOPSIS
34 B<!!!! PRE-ALPHA ALERT !!!!>
36 Nothing in here is believed to be stable, nothing yet intended for
37 public consumption. The plan is to provide scripts that act as
38 frontends for all the backend functionality. Option and method names
39 may still change.
41 For the rationale see the section BACKGROUND.
43 The documentation in here is normally not needed because the code is
44 meant to be run from several standalone programs. For a quick
45 overview, see the file README.mirrorcpan and the bin/ directory of the
46 distribution. For the architectural ideas see the section THE
47 ARCHITECTURE OF A COLLECTION OF RECENTFILES below.
49 File::Rsync::Mirror::Recent establishes a view on a collection of
50 File::Rsync::Mirror::Recentfile objects and provides abstractions
51 spanning multiple time intervals associated with those.
53 =head1 EXPORT
55 No exports.
57 =head1 CONSTRUCTORS
59 =head2 my $obj = CLASS->new(%hash)
61 Constructor. On every argument pair the key is a method name and the
62 value is an argument to that method name.
64 =cut
66 sub new {
67 my($class, @args) = @_;
68 my $self = bless {}, $class;
69 while (@args) {
70 my($method,$arg) = splice @args, 0, 2;
71 $self->$method($arg);
73 return $self;
76 =head1 ACCESSORS
78 =cut
80 my @accessors;
82 BEGIN {
83 @accessors =
85 "__pathdb",
86 "_dirtymark", # keeps track of the dirtymark of the recentfiles
87 "_logfilefordone", # turns on _logfile on all DONE
88 # systems (disk intensive)
89 "_max_one_state", # when we have no time left but want
90 # at least get one file per
91 # iteration to avoid procrastination
92 "_principal_recentfile",
93 "_recentfiles",
94 "_rsync",
95 "_runstatusfile", # frequenty dumps all rfs
96 "_verbose", # internal variable for verbose setter/getter
97 "_verboselog", # internal variable for verboselog setter/getter
100 my @pod_lines =
101 split /\n/, <<'=cut'; push @accessors, grep {s/^=item\s+//} @pod_lines; }
103 =over 4
105 =item ignore_link_stat_errors
107 as in F:R:M:Recentfile
109 =item local
111 Option to specify the local principal file for operations with a local
112 collection of recentfiles.
114 =item localroot
116 as in F:R:M:Recentfile
118 =item max_files_per_connection
120 as in F:R:M:Recentfile
122 =item remote
124 The remote principal recentfile in rsync notation. E.g.
126 pause.perl.org::authors/RECENT.recent
128 =item remoteroot
130 as in F:R:M:Recentfile
132 =item remote_recentfile
134 Rsync address of the remote C<RECENT.recent> symlink or whichever name
135 the principal remote recentfile has.
137 =item rsync_options
139 Things like compress, links, times or checksums. Passed in to the
140 File::Rsync object used to run the mirror.
142 =item tempdir
144 as in F:R:M:Recentfile
146 =item ttl
148 Minimum time before fetching the principal recentfile again.
150 =back
152 =cut
154 use accessors @accessors;
156 =head1 METHODS
158 =head2 $arrayref = $obj->news ( %options )
160 Test this with:
162 perl -Ilib bin/rrr-news \
163 -after 1217200539 \
164 -max 12 \
165 -local /home/ftp/pub/PAUSE/authors/RECENT.recent
167 perl -Ilib bin/rrr-news \
168 -after 1217200539 \
169 -rsync=compress=1 \
170 -rsync=links=1 \
171 -localroot /home/ftp/pub/PAUSE/authors/ \
172 -remote pause.perl.org::authors/RECENT.recent
173 -verbose
175 Note: all parameters that can be passed to recent_events can also be specified here.
177 Note: all data are kept in memory
179 =cut
181 sub news {
182 my($self, %opt) = @_;
183 my $local = $self->local;
184 unless ($local) {
185 if (my $remote = $self->remote) {
186 my $localroot;
187 if ($localroot = $self->localroot) {
188 # nice, they know what they are doing
189 } else {
190 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
192 } else {
193 die "Alert: neither local nor remote specified, cannot continue";
196 my $rfs = $self->recentfiles;
197 my $ret = [];
198 my $before;
199 for my $rf (@$rfs) {
200 my %locopt = %opt;
201 $locopt{before} = $before;
202 if ($opt{max}) {
203 $locopt{max} -= scalar @$ret;
204 last if $locopt{max} <= 0;
206 $locopt{info} = {};
207 my $res = $rf->recent_events(%locopt);
208 if (@$res){
209 push @$ret, @$res;
211 if ($opt{max} && scalar @$ret > $opt{max}) {
212 last;
214 if ($opt{after}){
215 if ( $locopt{info}{last} && _bigfloatlt($locopt{info}{last}{epoch},$opt{after}) ) {
216 last;
218 if ( _bigfloatgt($opt{after},$locopt{info}{first}{epoch}) ) {
219 last;
222 if (!@$res){
223 next;
225 $before = $res->[-1]{epoch};
226 $before = $opt{before} if $opt{before} && _bigfloatlt($opt{before},$before);
228 $ret;
231 =head2 overview ( %options )
233 returns a small table that summarizes the state of all recentfiles
234 collected in this Recent object.
236 $options{verbose}=1 increases the number of columns displayed.
238 Here is an example output:
240 Ival Cnt Max Min Span Util Cloud
241 1h 47 1225053014.38 1225049650.91 3363.47 93.4% ^ ^
242 6h 324 1225052939.66 1225033394.84 19544.82 90.5% ^ ^
243 1d 437 1225049651.53 1224966402.53 83248.99 96.4% ^ ^
244 1W 1585 1225039015.75 1224435339.46 603676.29 99.8% ^ ^
245 1M 5855 1225017376.65 1222428503.57 2588873.08 99.9% ^ ^
246 1Q 17066 1224578930.40 1216803512.90 7775417.50 100.0% ^ ^
247 1Y 15901 1223966162.56 1216766820.67 7199341.89 22.8% ^ ^
248 Z 9909 1223966162.56 1216766820.67 7199341.89 - ^ ^
250 I<Max> is the name of the interval.
252 I<Cnt> is the number of entries in this recentfile.
254 I<Max> is the highest(first) epoch in this recentfile, rounded.
256 I<Min> is the lowest(last) epoch in thie recentfile, rounded.
258 I<Span> is the timespan currently covered, rounded.
260 I<Util> is I<Span> devided by the designated timespan of this
261 recentfile.
263 I<Cloud> is ascii art illustrating the sequence of the Max and Min
264 timestamps.
266 =cut
267 sub overview {
268 my($self,%options) = @_;
269 my $rfs = $self->recentfiles;
270 my(@s,%rank);
271 RECENTFILE: for my $rf (@$rfs) {
272 my $re=$rf->recent_events;
273 my $rfsummary;
274 if (@$re) {
275 my $span = $re->[0]{epoch}-$re->[-1]{epoch};
276 my $merged = $rf->merged;
277 $rfsummary =
279 "Ival",
280 $rf->interval,
281 "Cnt",
282 scalar @$re,
283 "Dirtymark",
284 $rf->dirtymark ? sprintf("%.2f",$rf->dirtymark) : "-",
285 "Merged",
286 ($rf->interval eq "Z"
290 sprintf ("%.2f", $merged->{epoch} || 0)),
291 "Max",
292 sprintf ("%.2f", $re->[0]{epoch}),
293 "Min",
294 sprintf ("%.2f", $re->[-1]{epoch}),
295 "Span",
296 sprintf ("%.2f", $span),
297 "Util", # u9n:)
298 ($rf->interval eq "Z"
302 sprintf ("%5.1f%%", 100 * $span / $rf->interval_secs)
305 @rank{mapp {$b} grepp {$a =~ /^(Max|Min)$/} @$rfsummary} = ();
306 } else {
307 next RECENTFILE;
309 push @s, $rfsummary;
311 @rank{sort {$b <=> $a} keys %rank} = 1..keys %rank;
312 my $maxrank = max values %rank;
313 for my $rfsummary (@s) {
314 my $string = " " x $maxrank;
315 my @borders;
316 for my $ele (qw(Max Min)) {
317 my($r) = mapp {$b} grepp {$a eq $ele} @$rfsummary;
318 push @borders, $rank{$r}-1;
320 for ($borders[0],$borders[1]) {
321 substr($string,$_,1) = "^";
323 push @$rfsummary, "Cloud", $string;
325 unless ($options{verbose}) {
326 my %filter = map {($_=>1)} qw(Ival Cnt Max Min Span Util Cloud);
327 for (@s) {
328 $_ = [mapp {($a,$b)} grepp {!!$filter{$a}} @$_];
331 my @sprintf;
332 for (my $i = 0; $i <= $#{$s[0]}; $i+=2) {
333 my $maxlength = max ((map { length $_->[$i+1] } @s), length $s[0][$i]);
334 push @sprintf, "%" . $maxlength . "s";
336 my $sprintf = join " ", @sprintf;
337 $sprintf .= "\n";
338 my $headline = sprintf $sprintf, mapp {$a} @{$s[0]};
339 join "", $headline, map { sprintf $sprintf, mapp {$b} @$_ } @s;
342 =head2 _pathdb
344 Keeping track of already handled files. Currently it is a hash, will
345 probably become a database with its own accessors.
347 =cut
349 sub _pathdb {
350 my($self, $set) = @_;
351 if ($set) {
352 $self->__pathdb ($set);
354 my $pathdb = $self->__pathdb;
355 unless (defined $pathdb) {
356 $self->__pathdb(+{});
358 return $self->__pathdb;
361 =head2 $recentfile = $obj->principal_recentfile ()
363 returns the principal recentfile object of this tree.
365 =cut
366 # mirrors the recentfile and instantiates the recentfile object
367 sub _principal_recentfile_fromremote {
368 my($self) = @_;
369 # get the remote recentfile
370 my $rrfile = $self->remote or die "Alert: cannot construct a recentfile object without the 'remote' attribute";
371 my $splitter = qr{(.+)/([^/]*)};
372 my($remoteroot,$rfilename) = $rrfile =~ $splitter;
373 $self->remoteroot($remoteroot);
374 my($abslfile, $fh);
375 if (!defined $rfilename) {
376 die "Alert: Cannot resolve '$rrfile', does not match $splitter";
377 } elsif (not length $rfilename or $rfilename eq "RECENT.recent") {
378 ($abslfile,$rfilename,$fh) = $self->_principal_recentfile_fromremote_resosymlink($rfilename);
380 my @need_args =
382 "ignore_link_stat_errors",
383 "localroot",
384 "max_files_per_connection",
385 "remoteroot",
386 "rsync_options",
387 "tempdir",
388 "ttl",
389 "verbose",
390 "verboselog",
392 my $rf0;
393 unless ($abslfile) {
394 $rf0 = File::Rsync::Mirror::Recentfile->new (map {($_ => $self->$_)} @need_args);
395 $rf0->split_rfilename($rfilename);
396 $abslfile = $rf0->get_remote_recentfile_as_tempfile ();
398 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ( $abslfile );
399 $rf0->_current_tempfile ( $abslfile );
400 $rf0->_current_tempfile_fh ( $fh );
401 $rf0->_use_tempfile (1);
402 for my $override (@need_args) {
403 $rf0->$override ( $self->$override );
405 $rf0->is_slave (1);
406 return $rf0;
408 sub principal_recentfile {
409 my($self) = @_;
410 my $rf0 = $self->_principal_recentfile;
411 return $rf0 if defined $rf0;
412 my $local = $self->local;
413 if ($local) {
414 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ($local);
415 } else {
416 if (my $remote = $self->remote) {
417 my $localroot;
418 if ($localroot = $self->localroot) {
419 # nice, they know what they are doing
420 } else {
421 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
423 $rf0 = $self->_principal_recentfile_fromremote;
424 } else {
425 die "Alert: neither local nor remote specified, cannot continue";
428 $self->_principal_recentfile($rf0);
429 return $rf0;
432 =head2 $recentfiles_arrayref = $obj->recentfiles ()
434 returns a reference to the complete list of recentfile objects that
435 describe this tree. No guarantee is given that the represented
436 recentfiles exist or have been read. They are just bare objects.
438 =cut
440 sub recentfiles {
441 my($self) = @_;
442 my $rfs = $self->_recentfiles;
443 return $rfs if defined $rfs;
444 my $rf0 = $self->principal_recentfile;
445 my $pathdb = $self->_pathdb;
446 $rf0->_pathdb ($pathdb);
447 my $aggregator = $rf0->aggregator;
448 my @rf = $rf0;
449 for my $agg (@$aggregator) {
450 my $nrf = $rf0->_sparse_clone;
451 $nrf->interval ( $agg );
452 $nrf->have_mirrored ( 0 );
453 $nrf->_pathdb ( $pathdb );
454 push @rf, $nrf;
456 $self->_recentfiles(\@rf);
457 return \@rf;
460 =head2 $success = $obj->rmirror ( %options )
462 Mirrors all recentfiles of the I<remote> address working through all
463 of them, mirroring their contents.
465 Test this with:
467 use File::Rsync::Mirror::Recent;
468 my $rrr = File::Rsync::Mirror::Recent->new(
469 ignore_link_stat_errors => 1,
470 localroot => "/home/ftp/pub/PAUSE/authors",
471 remote => "pause.perl.org::authors/RECENT.recent",
472 max_files_per_connection => 5000,
473 rsync_options => {
474 compress => 1,
475 links => 1,
476 times => 1,
477 checksum => 0,
479 verbose => 1,
480 _runstatusfile => "recent-rmirror-state.yml",
481 _logfilefordone => "recent-rmirror-donelog.log",
483 $rrr->rmirror ( "skip-deletes" => 1, loop => 1 );
485 Or try without the loop parameter and write the loop yourself:
487 use File::Rsync::Mirror::Recent;
488 my @rrr;
489 for my $t ("authors","modules"){
490 my $rrr = File::Rsync::Mirror::Recent->new(
491 ignore_link_stat_errors => 1,
492 localroot => "/home/ftp/pub/PAUSE/$t",
493 remote => "pause.perl.org::$t/RECENT.recent",
494 max_files_per_connection => 512,
495 rsync_options => {
496 compress => 1,
497 links => 1,
498 times => 1,
499 checksum => 0,
501 verbose => 1,
502 _runstatusfile => "recent-rmirror-state-$t.yml",
503 _logfilefordone => "recent-rmirror-donelog-$t.log",
504 ttl => 5,
506 push @rrr, $rrr;
508 while (){
509 for my $rrr (@rrr){
510 $rrr->rmirror ( "skip-deletes" => 1 );
512 warn "sleeping 23\n"; sleep 23;
516 =cut
517 # _alluptodate is unused but at least it worked last time I needed it,
518 # so let us keep it around
519 sub _alluptodate {
520 my($self) = @_;
521 my $sdm = $self->_dirtymark;
522 return unless defined $sdm;
523 for my $rf (@{$self->recentfiles}) {
524 return if $rf->seeded;
525 my $rfdm = $rf->dirtymark;
526 return unless defined $rfdm;
527 return unless $rfdm eq $sdm;
528 my $done = $rf->done;
529 return unless defined $done;
530 my $done_intervals = $done->_intervals;
531 return if !defined $done_intervals;
532 # nonono, may be more than one, only covered it must be:
533 # return if @$done_intervals > 1;
534 my $minmax = $rf->minmax;
535 return unless defined $minmax;
536 return unless $done->covered(@$minmax{qw(max min)});
538 # $DB::single++;
539 return 1;
541 sub _fullseed {
542 my($self) = @_;
543 for ( @{$self->recentfiles} ) { $_->seed(1) }
545 sub rmirror {
546 my($self, %options) = @_;
548 my $rfs = $self->recentfiles;
550 my $_every_20_seconds = sub {
551 $self->principal_recentfile->seed;
553 $_every_20_seconds->();
554 my $_sigint = sub {
555 # XXX exit gracefully (reminder)
558 # XXX needs accessor: warning, if set too low, we do nothing but
559 # mirror the principal!
560 my $minimum_time_per_loop = 20;
562 if (my $logfile = $self->_logfilefordone) {
563 for my $i (0..$#$rfs) {
564 $rfs->[$i]->done->_logfile($logfile);
567 if (my $dirtymark = $self->principal_recentfile->dirtymark) {
568 my $mydm = $self->_dirtymark;
569 if (!defined $mydm){
570 $self->_dirtymark($dirtymark);
571 } elsif ($dirtymark ne $mydm) {
572 if ($self->verbose) {
573 my $fh;
574 if (my $vl = $self->verboselog) {
575 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
576 } else {
577 $fh = \*STDERR;
579 print $fh "NewDirtymark: old[$mydm] new[$dirtymark]\n";
581 $self->_dirtymark($dirtymark);
584 LOOP: while () {
585 my $ttleave = time + $minimum_time_per_loop;
586 RECENTFILE: for my $i (0..$#$rfs) {
587 my $rf = $rfs->[$i];
588 if (my $file = $self->_runstatusfile) {
589 $self->_rmirror_runstatusfile ($file, $i, \%options);
591 if (time > $ttleave){
592 # Must make sure that one file can get fetched in any case
593 $self->_max_one_state(1);
595 if ($rf->seeded) {
596 $self->_rmirror_mirror ($i, \%options);
597 } elsif ($rf->uptodate){
598 if ($i < $#$rfs){
599 $rfs->[$i+1]->done->merge($rf->done);
601 # no further seed necessary because "every_20_seconds" does it
602 next RECENTFILE;
604 WORKUNIT: while (time < $ttleave) {
605 if ($rf->uptodate) {
606 $self->_rmirror_sleep_per_connection ($i);
607 next RECENTFILE;
608 } else {
609 $self->_rmirror_mirror ($i, \%options);
612 if ($self->_max_one_state) {
613 last RECENTFILE;
616 $self->_max_one_state(0);
617 if ($rfs->[-1]->uptodate) {
618 $self->_rmirror_cleanup;
619 if ($options{loop}) {
620 } else {
621 last LOOP;
624 my $sleep = $ttleave - time;
625 if ($sleep > 0.01) {
626 $self->_rmirror_endofloop_sleep ($sleep);
627 } else {
628 # negative time not invented yet:)
630 $_every_20_seconds->();
634 sub _rmirror_mirror {
635 my($self, $i, $options) = @_;
636 my $rfs = $self->recentfiles;
637 my $rf = $rfs->[$i];
638 my %locopt = %$options;
639 if ($self->_max_one_state) {
640 $locopt{max} = 1;
642 $locopt{piecemeal} = 1;
643 $rf->mirror (%locopt);
644 if ($i==0) {
645 # we limit to 0 for the case that upstream is broken and has
646 # more than one timestamp (happened on PAUSE 200903)
647 if (my $dirtymark = $rf->dirtymark) {
648 my $mydm = $self->_dirtymark;
649 if (!defined $mydm or $dirtymark ne $mydm) {
650 $self->_dirtymark($dirtymark);
651 $self->_fullseed;
657 sub _rmirror_sleep_per_connection {
658 my($self, $i) = @_;
659 my $rfs = $self->recentfiles;
660 my $rf = $rfs->[$i];
661 my $sleep = $rf->sleep_per_connection;
662 $sleep = 0.42 unless defined $sleep;
663 Time::HiRes::sleep $sleep;
664 $rfs->[$i+1]->done->merge($rf->done) if $i < $#$rfs;
667 sub _rmirror_cleanup {
668 my($self) = @_;
669 my $pathdb = $self->_pathdb();
670 for my $k (keys %$pathdb) {
671 delete $pathdb->{$k};
673 my $rfs = $self->recentfiles;
674 for my $i (0..$#$rfs-1) {
675 my $thismerged = $rfs->[$i]->merged;
676 my $next = $rfs->[$i+1];
677 my $nextminmax = $next->minmax;
678 # warn "DEBUG: i[$i] nextminmaxmax[$nextminmax->{max}] thismergedepoch[$thismerged->{epoch}]";
679 if (not defined $thismerged->{epoch} or _bigfloatlt($nextminmax->{max},$thismerged->{epoch})){
680 $next->seed;
681 # warn sprintf "DEBUG: next iv %s seeded since next-minmax-max[$nextminmax->{max}]lt this-merged-epoch[$thismerged->{epoch}]\n", $next->interval;
686 sub _rmirror_runstatusfile {
687 my($self, $file, $i, $options) = @_;
688 my $rfs = $self->recentfiles;
689 require YAML::Syck;
690 YAML::Syck::DumpFile
692 $file,
693 {i => $i,
694 options => $options,
695 self => [keys %$self], # passing $self leaks, dclone refuses because of globs
696 time => time,
697 uptodate => {map {($_=>$rfs->[$_]->uptodate)} 0..$#$rfs},
701 sub _rmirror_endofloop_sleep {
702 my($self, $sleep) = @_;
703 if ($self->verbose) {
704 my $fh;
705 if (my $vl = $self->verboselog) {
706 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
707 } else {
708 $fh = \*STDERR;
710 printf $fh
712 "Dorm %d (%s secs)\n",
713 time,
714 $sleep,
716 sleep $sleep;
720 # it returns two things: abslfile and rfilename. But the abslfile is
721 # undef when the rfilename ends in .recent. A weird interface, my
722 # friend.
723 sub _principal_recentfile_fromremote_resosymlink {
724 my($self, $rfilename) = @_;
725 $rfilename = "RECENT.recent" unless length $rfilename;
726 my $abslfile = undef;
727 my $fh;
728 if ($rfilename =~ /\.recent$/) {
729 # may be a file *or* a symlink,
730 ($abslfile,$fh) = $self->_fetch_as_tempfile ($rfilename);
731 while (-l $abslfile) {
732 my $symlink = readlink $abslfile;
733 if ($symlink =~ m|/|) {
734 die "FIXME: filenames containing '/' not supported, got '$symlink'";
736 my $localrfile = File::Spec->catfile($self->localroot, $rfilename);
737 if (-e $localrfile) {
738 my $old_symlink = readlink $localrfile;
739 if ($old_symlink eq $symlink) {
740 unlink $abslfile or die "Cannot unlink '$abslfile': $!";
741 } else {
742 unlink $localrfile; # may fail
743 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
745 } else {
746 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
748 ($abslfile,$fh) = $self->_fetch_as_tempfile ($symlink);
751 return ($abslfile, $rfilename, $fh);
754 # takes a basename, returns an absolute name, does not delete the
755 # file, throws the $fh away. Caller must rename or unlink
757 # XXX needs to activate the fh in the rf0 so that it is able to unlink
758 # the file. I would like that the file is used immediately by $rf0
759 sub _fetch_as_tempfile {
760 my($self, $rfile) = @_;
761 my($suffix) = $rfile =~ /(\.[^\.]+)$/;
762 $suffix = "" unless defined $suffix;
763 my $fh = File::Temp->new
764 (TEMPLATE => sprintf(".FRMRecent-%s-XXXX",
765 $rfile,
767 DIR => $self->tempdir || $self->localroot,
768 SUFFIX => $suffix,
769 UNLINK => 0,
771 my $rsync;
772 unless ($rsync = File::Rsync->new($self->rsync_options)) {
773 require Carp;
774 Carp::confess(YAML::Syck::Dump($self->rsync_options));
776 my $dst = $fh->filename;
777 $rsync->exec
779 src => join("/",$self->remoteroot,$rfile),
780 dst => $dst,
781 ) or die "Could not mirror '$rfile' to $fh\: ".join(" ",$rsync->err);
782 unless (-l $dst) {
783 my $mode = 0644;
784 chmod $mode, $dst or die "Could not chmod $mode '$dst': $!";
786 return($dst,$fh);
789 =head2 $verbose = $obj->verbose ( $set )
791 Getter/setter method to set verbosity for this F:R:M:Recent object and
792 all associated Recentfile objects.
794 =cut
795 sub verbose {
796 my($self,$set) = @_;
797 if (defined $set) {
798 for ( @{$self->recentfiles} ) { $_->verbose($set) }
799 $self->_verbose ($set);
801 my $x = $self->_verbose;
802 unless (defined $x) {
803 $x = 0;
804 $self->_verbose ($x);
806 return $x;
810 =head2 my $vl = $obj->verboselog ( $set )
812 Getter/setter method for the path to the logfile to write verbose
813 progress information to.
815 Note: This is a primitive stop gap solution to get simple verbose
816 logging working. The program still sends error messages to STDERR.
817 Switching to Log4perl or similar is probably the way to go. TBD.
819 =cut
820 sub verboselog {
821 my($self,$set) = @_;
822 if (defined $set) {
823 for ( @{$self->recentfiles} ) { $_->verboselog($set) }
824 $self->_verboselog ($set);
826 my $x = $self->_verboselog;
827 unless (defined $x) {
828 $x = 0;
829 $self->_verboselog ($x);
831 return $x;
834 =head1 THE ARCHITECTURE OF A COLLECTION OF RECENTFILES
836 The idea is that we want to have a short file that records really
837 recent changes. So that a fresh mirror can be kept fresh as long as
838 the connectivity is given. Then we want longer files that record the
839 history before. So when the mirror falls behind the update period
840 reflected in the shortest file, it can complement the list of recent
841 file events with the next one. And if this is not long enough we want
842 another one, again a bit longer. And we want one that completes the
843 history back to the oldest file. The index files do contain the
844 complete list of current files. The longer a period covered by an
845 index file is gone the less often the index file is updated. For
846 practical reasons adjacent files will often overlap a bit but this is
847 neither necessary nor enforced. That's the basic idea. The following
848 example represents a tree that has a few updates every day:
850 RECENT.recent -> RECENT-1h.yaml
851 RECENT-1h.yaml
852 RECENT-6h.yaml
853 RECENT-1d.yaml
854 RECENT-1M.yaml
855 RECENT-1W.yaml
856 RECENT-1Q.yaml
857 RECENT-1Y.yaml
858 RECENT-Z.yaml
860 The first file is the principal file, in so far it is the one that is
861 written first after a filesystem change. Usually a symlink links to it
862 with a filename that has the same filenameroot and the suffix
863 C<.recent>. On systems that do not support symlinks there is a plain
864 copy maintained instead.
866 The last file, the Z file, contains the complementary files that are
867 in none of the other files. It may contain C<delete> events but often
868 C<delete> events are discarded at the transition to the Z file.
870 =head2 THE INDIVIDUAL RECENTFILE
872 A I<recentfile> consists of a hash that has two keys: C<meta> and
873 C<recent>. The C<meta> part has metadata and the C<recent> part has a
874 list of fileobjects.
876 =head2 THE META PART
878 Here we find things that are pretty much self explaining: all
879 lowercase attributes are accessors and as such explained in the
880 manpages. The uppercase attribute C<Producers> contains version
881 information about involved software components. Nothing to worry about
882 as I believe.
884 =head2 THE RECENT PART
886 This is the interesting part. Every entry refers to some filesystem
887 change (with path, epoch, type).
889 The I<epoch> value is the point in time when some change was
890 I<registered> but can be set to arbitrary values. Do not be tempted to
891 believe that the entry has a direct relation to something like
892 modification time or change time on the filesystem level. They are not
893 reflecting release dates. (If you want exact release dates: Barbie is
894 providing a database of them. See
895 http://use.perl.org/~barbie/journal/37907).
897 All these entries can be devided into two types (denoted by the
898 I<type> attribute): C<new>s and C<delete>s. Changes and creations are
899 C<new>s. Deletes are C<delete>s.
901 Besides an I<epoch> and a I<type> attribute we find a third one:
902 I<path>. This path is relative to the directory we find the
903 I<recentfile> in.
905 The order of the entries in the I<recentfile> is by decreasing epoch
906 attribute. These are unique floating point numbers. When the server
907 has ntp running correctly, then the timestamps are usually reflecting
908 a real epoch. If time is running backwards, we trump the system epoch
909 with strictly monotonically increasing floating point timestamps and
910 guarantee they are unique.
912 =head1 CORRUPTION AND RECOVERY
914 If the origin host breaks the promise to deliver consistent and
915 complete I<recentfiles> then it must update its C<dirtymark> and all
916 slaves must discard what they cosider the truth. In the worst case
917 that something goes wrong despite the dirtymark mechanism the way back
918 to sanity can always be achieved through traditional rsyncing between
919 the hosts.
921 =head1 BACKGROUND
923 This is about speeding up rsync operation on large trees. Uses a small
924 metadata cocktail and pull technology.
926 =head2 NON-COMPETITORS
928 File::Mirror JWU/File-Mirror/File-Mirror-0.10.tar.gz only local trees
929 Mirror::YAML ADAMK/Mirror-YAML-0.03.tar.gz some sort of inner circle
930 Net::DownloadMirror KNORR/Net-DownloadMirror-0.04.tar.gz FTP sites and stuff
931 Net::MirrorDir KNORR/Net-MirrorDir-0.05.tar.gz dito
932 Net::UploadMirror KNORR/Net-UploadMirror-0.06.tar.gz dito
933 Pushmi::Mirror CLKAO/Pushmi-v1.0.0.tar.gz something SVK
935 rsnapshot www.rsnapshot.org focus on backup
936 csync www.csync.org more like unison
937 multi-rsync sourceforge 167893 lan push to many
939 =head2 COMPETITORS
941 The problem to solve which clusters and ftp mirrors and otherwise
942 replicated datasets like CPAN share: how to transfer only a minimum
943 amount of data to determine the diff between two hosts.
945 Normally it takes a long time to determine the diff itself before it
946 can be transferred. Known solutions at the time of this writing are
947 csync2, and rsync 3 batch mode.
949 For many years the best solution was B<csync2> which solves the
950 problem by maintaining a sqlite database on both ends and talking a
951 highly sophisticated protocol to quickly determine which files to send
952 and which to delete at any given point in time. Csync2 is often
953 inconvenient because it is push technology and the act of syncing
954 demands quite an intimate relationship between the sender and the
955 receiver. This is hard to achieve in an environment of loosely coupled
956 sites where the number of sites is large or connections are unreliable
957 or network topology is changing.
959 B<Rsync 3 batch mode> works around these problems by providing
960 rsync-able batch files which allow receiving nodes to replay the
961 history of the other nodes. This reduces the need to have an
962 incestuous relation but it has the disadvantage that these batch files
963 replicate the contents of the involved files. This seems inappropriate
964 when the nodes already have a means of communicating over rsync.
966 B<instantmirror> at https://fedorahosted.org/InstantMirror/ is an
967 ambitious project that tries to combine various technologies to
968 overcome the current situation. It's been founded in 2009-03 and at
969 the time of this writing it is still a bit early to comment on.
971 rersyncrecent solves this problem with a couple of (usually 2-10)
972 lightweight index files which cover different overlapping time
973 intervals. The master writes these files and the clients/slaves can
974 construct the full tree from the information contained in them. The
975 most recent index file usually covers the last seconds or minutes or
976 hours of the tree and depending on the needs, slaves can rsync every
977 few seconds or minutes and then bring their trees in full sync.
979 The rersyncrecent mode was developed for CPAN but as it is convenient
980 and economic it is also a general purpose solution. I'm looking
981 forward to see a CPAN backbone that is only a few seconds behind
982 PAUSE. And then ... the first FUSE based CPAN filesystem anyone?
984 =head1 LIMITATIONS
986 If the tree of the master server is changing faster than the bandwidth
987 permits to mirror then additional protocols may need to be deployed.
988 Certainly p2p/bittorrent can help in such situations because
989 downloading sites help each other and bittorrent chunks large files
990 into pieces.
992 =head1 FUTURE DIRECTIONS
994 Currently the origin server must keep track of injected and removed
995 files. Should be supported by an inotify-based assistant.
997 Convince other users outside the CPAN like
998 http://fedoraproject.org/wiki/Infrastructure/Mirroring
1000 =head1 SEE ALSO
1002 L<File::Rsync::Mirror::Recentfile>,
1003 L<File::Rsync::Mirror::Recentfile::Done>,
1004 L<File::Rsync::Mirror::Recentfile::FakeBigFloat>
1006 =head1 BUGS
1008 Please report any bugs or feature requests through the web interface
1010 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=File-Rsync-Mirror-Recent>.
1011 I will be notified, and then you'll automatically be notified of
1012 progress on your bug as I make changes.
1014 =head1 SUPPORT
1016 You can find documentation for this module with the perldoc command.
1018 perldoc File::Rsync::Mirror::Recent
1020 You can also look for information at:
1022 =over 4
1024 =item * RT: CPAN's request tracker
1026 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=File-Rsync-Mirror-Recent>
1028 =item * AnnoCPAN: Annotated CPAN documentation
1030 L<http://annocpan.org/dist/File-Rsync-Mirror-Recent>
1032 =item * CPAN Ratings
1034 L<http://cpanratings.perl.org/d/File-Rsync-Mirror-Recent>
1036 =item * Search CPAN
1038 L<http://search.cpan.org/dist/File-Rsync-Mirror-Recent>
1040 =back
1043 =head1 ACKNOWLEDGEMENTS
1045 Thanks to RJBS for module-starter.
1047 =head1 AUTHOR
1049 Andreas König
1051 =head1 COPYRIGHT & LICENSE
1053 Copyright 2008, 2009 Andreas König.
1055 This program is free software; you can redistribute it and/or modify it
1056 under the same terms as Perl itself.
1059 =cut
1061 1; # End of File::Rsync::Mirror::Recent