small doc clarifications
[rersyncrecent.git] / lib / File / Rsync / Mirror / Recent.pm
blobabd08141ab0e6e42ea485ba6e00060c29d7ea61b
1 package File::Rsync::Mirror::Recent;
3 # use warnings;
4 use strict;
5 use File::Rsync::Mirror::Recentfile;
7 =encoding utf-8
9 =head1 NAME
11 File::Rsync::Mirror::Recent - mirroring via rsync made efficient
13 =cut
15 package File::Rsync::Mirror::Recent;
17 use File::Basename qw(basename dirname fileparse);
18 use File::Copy qw(cp);
19 use File::Path qw(mkpath);
20 use File::Rsync;
21 use File::Rsync::Mirror::Recentfile::Done (); # at least needed by thaw()
22 use File::Rsync::Mirror::Recentfile::FakeBigFloat qw(:all);
23 use File::Temp;
24 use List::Pairwise qw(mapp grepp);
25 use List::Util qw(first max);
26 use Scalar::Util qw(blessed reftype);
27 use Storable;
28 use Time::HiRes qw();
29 use YAML::Syck;
31 use version; our $VERSION = qv('0.0.8');
33 =head1 SYNOPSIS
35 The documentation in here is normally not needed because the code is
36 considered to be run from several standalone programs. For a quick
37 overview, see the file README.mirrorcpan and the bin/ directory of the
38 distribution. For the architectural ideas see the section THE
39 ARCHITECTURE OF A COLLECTION OF RECENTFILES below.
41 File::Rsync::Mirror::Recent establishes a view on a collection of
42 File::Rsync::Mirror::Recentfile objects and provides abstractions
43 spanning multiple time intervals associated with those.
45 =head1 EXPORT
47 No exports.
49 =head1 CONSTRUCTORS
51 =head2 my $obj = CLASS->new(%hash)
53 Constructor. On every argument pair the key is a method name and the
54 value is an argument to that method name.
56 =cut
58 sub new {
59 my($class, @args) = @_;
60 my $self = bless {}, $class;
61 while (@args) {
62 my($method,$arg) = splice @args, 0, 2;
63 $self->$method($arg);
65 return $self;
68 =head2 my $obj = CLASS->thaw($statusfile)
70 Constructor from a statusfile left over from a previous
71 rmirror run. See also C<runstatusfile>.
73 =cut
75 sub _thaw_without_pathdb {
76 my($self,$file) = @_;
77 open my $fh, $file or die "Can't open '$file': $!";
78 local $/ = "\n";
79 my $in_pathdb = 0;
80 my $tfile = File::Temp->new
82 TEMPLATE => "Recent-thaw-XXXX",
83 TMPDIR => 1,
84 UNLINK => 0,
85 CLEANUP => 0,
86 SUFFIX => '.dat',
88 my $template_for_eop;
89 while (<$fh>) {
90 if ($in_pathdb) {
91 if (/$template_for_eop/) {
92 $in_pathdb = 0;
94 } elsif (/(\s+)-\s*__pathdb\s*:/) {
95 $in_pathdb = 1;
96 my $next_attr = sprintf "^%s\\S", " ?" x length($1);
97 $template_for_eop = qr{$next_attr};
99 print $tfile $_ unless $in_pathdb;
101 close $tfile or die "Could not close: $!";
102 my $return = $self->thaw($tfile->filename);
103 $return->_havelostpathdb(1);
104 unlink $tfile->filename;
105 return $return;
107 sub thaw {
108 my($self, $file) = @_;
109 die "thaw called without statusfile argument" unless defined $file;
110 unless (-e $file){
111 require Carp;
112 Carp::confess("Alert: statusfile '$file' not found");
114 require YAML::Syck;
115 my $start = time;
116 my $sleeptime = 0.02;
117 while (not mkdir "$file.lock") {
118 my $err = $!;
119 Time::HiRes::sleep $sleeptime;
120 my $waiting = time - $start;
121 if ($waiting >= 3){
122 warn "*** waiting ($waiting) for lock ($err) ***";
123 $sleeptime = 1;
126 my $size = -s $file;
127 my $serialized = YAML::Syck::LoadFile($file);
128 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
129 my $charged_self = $serialized->{reduced_self};
130 my $class = blessed $self;
131 bless $charged_self, $class;
132 my $rfs = $serialized->{reduced_rfs};
133 my $rfclass = $class . "file"; # "Recent" . "file"
134 my $pathdb = $charged_self->_pathdb;
135 for my $rf (@$rfs) {
136 bless $rf, $rfclass;
137 $rf->_pathdb($pathdb);
139 $charged_self->_recentfiles($rfs);
140 $charged_self->_principal_recentfile($rfs->[0]);
141 # die "FIXME: thaw all recentfiles from reduced_rfs into _recentfiles as well, watch out for pathdb and rsync";
142 return $charged_self;
145 =head1 ACCESSORS
147 =cut
149 my @accessors;
151 BEGIN {
152 @accessors =
154 "__pathdb",
155 "_dirtymark", # keeps track of the dirtymark of the recentfiles
156 "_havelostpathdb", # boolean
157 "_have_written_statusfile", # boolean
158 "_logfilefordone", # turns on _logfile on all DONE
159 # systems (disk intensive)
160 "_max_one_state", # when we have no time left but want
161 # at least get one file per
162 # iteration to avoid procrastination
163 "_principal_recentfile",
164 "_recentfiles",
165 "_rsync",
166 "_runstatusfile", # occasionally dumps all rfs
167 "_verbose", # internal variable for verbose setter/getter
168 "_verboselog", # internal variable for verboselog setter/getter
171 my @pod_lines =
172 split /\n/, <<'=cut'; push @accessors, grep {s/^=item\s+//} @pod_lines; }
174 =over 4
176 =item ignore_link_stat_errors
178 as in F:R:M:Recentfile
180 =item local
182 Option to specify the local principal file for operations with a local
183 collection of recentfiles.
185 =item localroot
187 as in F:R:M:Recentfile
189 =item max_files_per_connection
191 as in F:R:M:Recentfile
193 =item remote
195 The remote principal recentfile in rsync notation. E.g.
197 pause.perl.org::authors/RECENT.recent
199 =item remoteroot
201 as in F:R:M:Recentfile
203 =item remote_recentfile
205 Rsync address of the remote C<RECENT.recent> symlink or whichever name
206 the principal remote recentfile has.
208 =item rsync_options
210 Things like compress, links, times or checksums. Passed in to the
211 File::Rsync object used to run the mirror.
213 =item tempdir
215 as in F:R:M:Recentfile
217 =item ttl
219 Minimum time before fetching the principal recentfile again.
221 =back
223 =cut
225 use accessors @accessors;
227 =head1 METHODS
229 =head2 $arrayref = $obj->news ( %options )
231 Test this with:
233 perl -Ilib bin/rrr-news \
234 -after 1217200539 \
235 -max 12 \
236 -local /home/ftp/pub/PAUSE/authors/RECENT.recent
238 perl -Ilib bin/rrr-news \
239 -after 1217200539 \
240 -rsync=compress=1 \
241 -rsync=links=1 \
242 -localroot /home/ftp/pub/PAUSE/authors/ \
243 -remote pause.perl.org::authors/RECENT.recent
244 -verbose
246 Note: all parameters that can be passed to
247 File:Rsync:Mirror:Recentfile::recent_events() can also be specified
248 here.
250 Note: all data are kept in memory
252 =cut
254 sub news {
255 my($self, %opt) = @_;
256 my $local = $self->local;
257 unless ($local) {
258 if (my $remote = $self->remote) {
259 my $localroot;
260 if ($localroot = $self->localroot) {
261 # nice, they know what they are doing
262 } else {
263 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
265 } else {
266 die "Alert: neither local nor remote specified, cannot continue";
269 my $rfs = $self->recentfiles;
270 my $ret = [];
271 my $before;
272 for my $rf (@$rfs) {
273 my %locopt = %opt;
274 $locopt{before} = $before;
275 if ($opt{max}) {
276 $locopt{max} -= scalar @$ret;
277 last if $locopt{max} <= 0;
279 $locopt{info} = {};
280 my $res = $rf->recent_events(%locopt);
281 if (@$res){
282 push @$ret, @$res;
284 if ($opt{max} && scalar @$ret > $opt{max}) {
285 last;
287 if ($opt{after}){
288 if ( $locopt{info}{last} && _bigfloatlt($locopt{info}{last}{epoch},$opt{after}) ) {
289 last;
291 if ( _bigfloatgt($opt{after},$locopt{info}{first}{epoch}) ) {
292 last;
295 if (!@$res){
296 next;
298 $before = $res->[-1]{epoch};
299 $before = $opt{before} if $opt{before} && _bigfloatlt($opt{before},$before);
301 $ret;
304 =head2 overview ( %options )
306 returns a small table that summarizes the state of all recentfiles
307 collected in this Recent object.
309 $options{verbose}=1 increases the number of columns displayed.
311 Here is an example output:
313 Ival Cnt Max Min Span Util Cloud
314 1h 47 1225053014.38 1225049650.91 3363.47 93.4% ^ ^
315 6h 324 1225052939.66 1225033394.84 19544.82 90.5% ^ ^
316 1d 437 1225049651.53 1224966402.53 83248.99 96.4% ^ ^
317 1W 1585 1225039015.75 1224435339.46 603676.29 99.8% ^ ^
318 1M 5855 1225017376.65 1222428503.57 2588873.08 99.9% ^ ^
319 1Q 17066 1224578930.40 1216803512.90 7775417.50 100.0% ^ ^
320 1Y 15901 1223966162.56 1216766820.67 7199341.89 22.8% ^ ^
321 Z 9909 1223966162.56 1216766820.67 7199341.89 - ^ ^
323 I<Max> is the name of the interval.
325 I<Cnt> is the number of entries in this recentfile.
327 I<Max> is the highest(first) epoch in this recentfile, rounded.
329 I<Min> is the lowest(last) epoch in this recentfile, rounded.
331 I<Span> is the timespan currently covered, rounded.
333 I<Util> is I<Span> devided by the designated timespan of this
334 recentfile.
336 I<Cloud> is ascii art illustrating the sequence of the Max and Min
337 timestamps.
339 =cut
340 sub overview {
341 my($self,%options) = @_;
342 my $rfs = $self->recentfiles;
343 my(@s,%rank);
344 RECENTFILE: for my $rf (@$rfs) {
345 my $re=$rf->recent_events;
346 my $rfsummary;
347 if (@$re) {
348 my $span = $re->[0]{epoch}-$re->[-1]{epoch};
349 my $merged = $rf->merged;
350 $rfsummary =
352 "Ival",
353 $rf->interval,
354 "Cnt",
355 scalar @$re,
356 "Dirtymark",
357 $rf->dirtymark ? sprintf("%.2f",$rf->dirtymark) : "-",
358 "Produced",
359 sprintf ("%.2f", $rf->{ORIG}{Producers}{time}||0),
360 "Merged",
361 ($rf->interval eq "Z"
365 sprintf ("%.2f", $merged->{epoch} || 0)),
366 "Max",
367 sprintf ("%.2f", $re->[0]{epoch}),
368 "Min",
369 sprintf ("%.2f", $re->[-1]{epoch}),
370 "Span",
371 sprintf ("%.2f", $span),
372 "Util", # u9n:)
373 ($rf->interval eq "Z"
377 sprintf ("%5.1f%%", 100 * $span / $rf->interval_secs)
380 @rank{mapp {$b} grepp {$a =~ /^(Max|Min)$/} @$rfsummary} = ();
381 } else {
382 next RECENTFILE;
384 push @s, $rfsummary;
386 @rank{sort {$b <=> $a} keys %rank} = 1..keys %rank;
387 my $maxrank = max values %rank;
388 for my $rfsummary (@s) {
389 my $string = " " x $maxrank;
390 my @borders;
391 for my $ele (qw(Max Min)) {
392 my($r) = mapp {$b} grepp {$a eq $ele} @$rfsummary;
393 push @borders, $rank{$r}-1;
395 for ($borders[0],$borders[1]) {
396 substr($string,$_,1) = "^";
398 push @$rfsummary, "Cloud", $string;
400 unless ($options{verbose}) {
401 my %filter = map {($_=>1)} qw(Ival Cnt Max Min Span Util Cloud);
402 for (@s) {
403 $_ = [mapp {($a,$b)} grepp {!!$filter{$a}} @$_];
406 my @sprintf;
407 for (my $i = 0; $i <= $#{$s[0]}; $i+=2) {
408 my $maxlength = max ((map { length $_->[$i+1] } @s), length $s[0][$i]);
409 push @sprintf, "%" . $maxlength . "s";
411 my $sprintf = join " ", @sprintf;
412 $sprintf .= "\n";
413 my $headline = sprintf $sprintf, mapp {$a} @{$s[0]};
414 join "", $headline, map { sprintf $sprintf, mapp {$b} @$_ } @s;
417 =head2 _pathdb
419 Keeping track of already handled files. Currently it is a hash, will
420 probably become a database with its own accessors.
422 =cut
424 sub _pathdb {
425 my($self, $set) = @_;
426 if ($set) {
427 $self->__pathdb ($set);
429 my $pathdb = $self->__pathdb;
430 unless (defined $pathdb) {
431 $self->__pathdb(+{});
433 return $self->__pathdb;
436 =head2 $recentfile = $obj->principal_recentfile ()
438 returns the principal recentfile object of this tree.
440 =cut
441 # mirrors the recentfile and instantiates the recentfile object
442 sub _principal_recentfile_fromremote {
443 my($self) = @_;
444 # get the remote recentfile
445 my $rrfile = $self->remote or die "Alert: cannot construct a recentfile object without the 'remote' attribute";
446 my $splitter = qr{(.+)/([^/]*)};
447 my($remoteroot,$rfilename) = $rrfile =~ $splitter;
448 $self->remoteroot($remoteroot);
449 my($abslfile, $fh);
450 if (!defined $rfilename) {
451 die "Alert: Cannot resolve '$rrfile', does not match $splitter";
452 } elsif (not length $rfilename or $rfilename eq "RECENT.recent") {
453 ($abslfile,$rfilename,$fh) = $self->_principal_recentfile_fromremote_resosymlink($rfilename);
455 my @need_args =
457 "ignore_link_stat_errors",
458 "localroot",
459 "max_files_per_connection",
460 "remoteroot",
461 "rsync_options",
462 "tempdir",
463 "ttl",
464 "verbose",
465 "verboselog",
467 my $rf0;
468 unless ($abslfile) {
469 $rf0 = File::Rsync::Mirror::Recentfile->new (map {($_ => $self->$_)} @need_args);
470 $rf0->split_rfilename($rfilename);
471 $abslfile = $rf0->get_remote_recentfile_as_tempfile ();
473 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ( $abslfile );
474 $rf0->_current_tempfile ( $abslfile );
475 $rf0->_current_tempfile_fh ( $fh );
476 $rf0->_use_tempfile (1);
477 for my $override (@need_args) {
478 $rf0->$override ( $self->$override );
480 $rf0->is_slave (1);
481 return $rf0;
483 sub principal_recentfile {
484 my($self) = @_;
485 my $rf0 = $self->_principal_recentfile;
486 return $rf0 if defined $rf0;
487 my $local = $self->local;
488 if ($local) {
489 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ($local);
490 } else {
491 if (my $remote = $self->remote) {
492 my $localroot;
493 if ($localroot = $self->localroot) {
494 # nice, they know what they are doing
495 } else {
496 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
498 $rf0 = $self->_principal_recentfile_fromremote;
499 } else {
500 die "Alert: neither local nor remote specified, cannot continue";
503 $self->_principal_recentfile($rf0);
504 return $rf0;
507 =head2 $recentfiles_arrayref = $obj->recentfiles ()
509 returns a reference to the complete list of recentfile objects that
510 describe this tree. No guarantee is given that the represented
511 recentfiles exist or have been read. They are just bare objects.
513 =cut
515 sub recentfiles {
516 my($self) = @_;
517 my $rfs = $self->_recentfiles;
518 return $rfs if defined $rfs;
519 my $rf0 = $self->principal_recentfile;
520 my $pathdb = $self->_pathdb;
521 $rf0->_pathdb ($pathdb);
522 my $aggregator = $rf0->aggregator;
523 my @rf = $rf0;
524 for my $agg (@$aggregator) {
525 my $nrf = $rf0->_sparse_clone;
526 $nrf->interval ( $agg );
527 $nrf->have_mirrored ( 0 );
528 $nrf->_pathdb ( $pathdb );
529 push @rf, $nrf;
531 $self->_recentfiles(\@rf);
532 return \@rf;
535 =head2 $success = $obj->rmirror ( %options )
537 Mirrors all recentfiles of the I<remote> address working through all
538 of them, mirroring their contents.
540 Test this with:
542 use File::Rsync::Mirror::Recent;
543 my $rrr = File::Rsync::Mirror::Recent->new(
544 ignore_link_stat_errors => 1,
545 localroot => "/home/ftp/pub/PAUSE/authors",
546 remote => "pause.perl.org::authors/RECENT.recent",
547 max_files_per_connection => 5000,
548 rsync_options => {
549 compress => 1,
550 links => 1,
551 times => 1,
552 checksum => 0,
554 verbose => 1,
555 _runstatusfile => "recent-rmirror-state.yml",
556 _logfilefordone => "recent-rmirror-donelog.log",
558 $rrr->rmirror ( "skip-deletes" => 1, loop => 1 );
560 Or try without the loop parameter and write the loop yourself:
562 use File::Rsync::Mirror::Recent;
563 my @rrr;
564 for my $t ("authors","modules"){
565 my $rrr = File::Rsync::Mirror::Recent->new(
566 ignore_link_stat_errors => 1,
567 localroot => "/home/ftp/pub/PAUSE/$t",
568 remote => "pause.perl.org::$t/RECENT.recent",
569 max_files_per_connection => 512,
570 rsync_options => {
571 compress => 1,
572 links => 1,
573 times => 1,
574 checksum => 0,
576 verbose => 1,
577 _runstatusfile => "recent-rmirror-state-$t.yml",
578 _logfilefordone => "recent-rmirror-donelog-$t.log",
579 ttl => 5,
581 push @rrr, $rrr;
583 while (){
584 for my $rrr (@rrr){
585 $rrr->rmirror ( "skip-deletes" => 1 );
587 warn "sleeping 23\n"; sleep 23;
591 =cut
592 # _alluptodate is unused but at least it worked last time I needed it,
593 # so let us keep it around
594 sub _alluptodate {
595 my($self) = @_;
596 my $sdm = $self->_dirtymark;
597 return unless defined $sdm;
598 for my $rf (@{$self->recentfiles}) {
599 return if $rf->seeded;
600 my $rfdm = $rf->dirtymark;
601 return unless defined $rfdm;
602 return unless $rfdm eq $sdm;
603 my $done = $rf->done;
604 return unless defined $done;
605 my $done_intervals = $done->_intervals;
606 return if !defined $done_intervals;
607 # nonono, may be more than one, only covered it must be:
608 # return if @$done_intervals > 1;
609 my $minmax = $rf->minmax;
610 return unless defined $minmax;
611 return unless $done->covered(@$minmax{qw(max min)});
613 # $DB::single++;
614 return 1;
616 sub _fullseed {
617 my($self) = @_;
618 for ( @{$self->recentfiles} ) { $_->seed(1) }
620 sub rmirror {
621 my($self, %options) = @_;
623 my $rfs = $self->recentfiles;
625 $self->principal_recentfile->seed;
626 my $_sigint = sub {
627 # XXX exit gracefully (reminder)
630 # XXX needs accessor: warning, if set too low, we do nothing but
631 # mirror the principal!
632 my $minimum_time_per_loop = 20;
634 if (my $logfile = $self->_logfilefordone) {
635 for my $i (0..$#$rfs) {
636 $rfs->[$i]->done->_logfile($logfile);
639 if (my $dirtymark = $self->principal_recentfile->dirtymark) {
640 my $mydm = $self->_dirtymark;
641 if (!defined $mydm){
642 $self->_dirtymark($dirtymark);
643 } elsif ($dirtymark ne $mydm) {
644 if ($self->verbose) {
645 my $fh;
646 if (my $vl = $self->verboselog) {
647 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
648 } else {
649 $fh = \*STDERR;
651 print $fh "NewDirtymark: old[$mydm] new[$dirtymark]\n";
653 $self->_dirtymark($dirtymark);
656 my $rstfile = $self->runstatusfile;
657 unless ($self->_have_written_statusfile) {
658 $self->_rmirror_runstatusfile_write ($rstfile, \%options);
659 $self->_have_written_statusfile(1);
661 $self->_rmirror_loop($minimum_time_per_loop,\%options);
664 sub _rmirror_loop {
665 my($self,$minimum_time_per_loop,$options) = @_;
666 LOOP: while () {
667 my $ttleave = time + $minimum_time_per_loop;
668 my $rstfile = $self->runstatusfile;
669 my $otherproc = $self->_thaw_without_pathdb ($rstfile);
670 my $pid = fork;
671 if (! defined $pid) {
672 warn "Contention: $!";
673 sleep 0.25;
674 next LOOP;
675 } elsif ($pid) {
676 waitpid($pid,0);
677 } else {
678 $self = $self->thaw ($rstfile);
679 my $rfs = $self->recentfiles;
680 $self->principal_recentfile->seed;
681 RECENTFILE: for my $i (0..$#$rfs) {
682 my $rf = $rfs->[$i];
683 if (time > $ttleave) {
684 # Must make sure that one file can get fetched in any case
685 $self->_max_one_state(1);
687 if ($rf->seeded) {
688 $self->_rmirror_mirror ($i, $options);
689 } elsif ($rf->uptodate) {
690 if ($i < $#$rfs) {
691 $rfs->[$i+1]->done->merge($rf->done);
693 # no further seed necessary because "periodic" does it
694 next RECENTFILE;
696 WORKUNIT: while (time < $ttleave) {
697 if ($rf->uptodate) {
698 $self->_rmirror_sleep_per_connection ($i);
699 next RECENTFILE;
700 } else {
701 $self->_rmirror_mirror ($i, $options);
704 if ($self->_max_one_state) {
705 last RECENTFILE;
708 $self->_max_one_state(0);
709 my $exit = 0;
710 if ($rfs->[-1]->uptodate) {
711 $self->_rmirror_cleanup;
713 unless ($options->{loop}) {
714 $exit = 1;
716 $self->_rmirror_runstatusfile_write ($rstfile, $options);
717 exit if $exit;
718 last LOOP;
721 $otherproc = $self->_thaw_without_pathdb ($rstfile);
722 if (!$options->{loop} && $otherproc && $otherproc->recentfiles->[-1]->uptodate) {
723 last LOOP;
725 my $sleep = $ttleave - time;
726 if ($sleep > 0.01) {
727 $self->_rmirror_endofloop_sleep ($sleep);
728 } else {
729 # negative time not invented yet:)
734 sub _rmirror_mirror {
735 my($self, $i, $options) = @_;
736 my $rfs = $self->recentfiles;
737 my $rf = $rfs->[$i];
738 my %locopt = %$options;
739 if ($self->_max_one_state) {
740 $locopt{max} = 1;
742 $locopt{piecemeal} = 1;
743 $rf->mirror (%locopt);
744 if ($i==0) {
745 # we limit to 0 for the case that upstream is broken and has
746 # more than one timestamp (happened on PAUSE 200903)
747 if (my $dirtymark = $rf->dirtymark) {
748 my $mydm = $self->_dirtymark;
749 if (!defined $mydm or $dirtymark ne $mydm) {
750 $self->_dirtymark($dirtymark);
751 $self->_fullseed;
757 sub _rmirror_sleep_per_connection {
758 my($self, $i) = @_;
759 my $rfs = $self->recentfiles;
760 my $rf = $rfs->[$i];
761 my $sleep = $rf->sleep_per_connection;
762 $sleep = 0.42 unless defined $sleep;
763 Time::HiRes::sleep $sleep;
764 $rfs->[$i+1]->done->merge($rf->done) if $i < $#$rfs;
767 sub _rmirror_cleanup {
768 my($self) = @_;
769 my $pathdb = $self->_pathdb();
770 for my $k (keys %$pathdb) {
771 delete $pathdb->{$k};
773 my $rfs = $self->recentfiles;
774 for my $i (0..$#$rfs-1) {
775 my $thismerged = $rfs->[$i]->merged;
776 my $next = $rfs->[$i+1];
777 my $nextminmax = $next->minmax;
778 if (not defined $thismerged->{epoch} or _bigfloatlt($nextminmax->{max},$thismerged->{epoch})){
779 $next->seed;
784 =head2 $file = $obj->runstatusfile ($set)
786 Getter/setter for C<_runstatusfile> attribute. Defaults to a temporary
787 file created by C<File::Temp>. A status file is required for
788 C<rmirror> working. Since it may be interesting for debugging
789 purposes, you may want to specify a permanent file for this.
791 =cut
792 sub runstatusfile {
793 my($self,$set) = @_;
794 if (defined $set) {
795 $self->_runstatusfile ($set);
797 my $x = $self->_runstatusfile;
798 unless (defined $x) {
799 require File::Temp;
800 my $tfile = File::Temp->new
802 TEMPLATE => "Recent-XXXX",
803 TMPDIR => 1,
804 UNLINK => 0,
805 CLEANUP => 0,
806 SUFFIX => '.dat',
808 $self->_runstatusfile($tfile->filename);
810 return $self->_runstatusfile;
813 # unused code.... it was an oops, discovered the thaw() method too
814 # late, and starting writing this here....
815 sub _rmirror_runstatusfile_read {
816 my($self, $file) = @_;
818 require YAML::Syck;
819 my $start = time;
820 # XXX is locking useful here?
821 while (not mkdir "$file.lock") {
822 Time::HiRes::sleep 0.2;
823 warn "*** waiting for lock ***" if time - $start >= 3;
825 my $yml = YAML::Syck::LoadFile $file;
826 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
827 my $rself = $yml->{reduced_self};
828 my $rfs = $yml->{reduced_rfs};
829 # XXX bring them into self
832 sub _rmirror_runstatusfile_write {
833 my($self, $file, $options) = @_;
834 my $rself;
835 while (my($k,$v) = each %$self) {
836 next if $k =~ /^-(_principal_recentfile|_recentfiles)$/;
837 $rself->{$k} = $v;
839 my $rfs = $self->recentfiles;
840 my $rrfs;
841 for my $i (0..$#$rfs) {
842 my $rf = $rfs->[$i];
843 while (my($k,$v) = each %$rf) {
844 next if $k =~ /^-(_current_tempfile_fh|_pathdb|_rsync)$/;
845 $rrfs->[$i]{$k} = $rfs->[$i]{$k};
848 require YAML::Syck;
849 my $start = time;
850 while (not mkdir "$file.lock") {
851 Time::HiRes::sleep 0.15;
852 warn "*** waiting for lock ***" if time - $start >= 3;
854 YAML::Syck::DumpFile
856 "$file.new",
858 options => $options,
859 time => time,
860 reduced_rfs => $rrfs,
861 reduced_self => $rself,
863 rename "$file.new", $file or die "Could not rename: $!";
864 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
867 sub _rmirror_endofloop_sleep {
868 my($self, $sleep) = @_;
869 if ($self->verbose) {
870 my $fh;
871 if (my $vl = $self->verboselog) {
872 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
873 } else {
874 $fh = \*STDERR;
876 printf $fh
878 "Dorm %d (%s secs)\n",
879 time,
880 $sleep,
882 sleep $sleep;
886 # it returns two things: abslfile and rfilename. But the abslfile is
887 # undef when the rfilename ends in .recent. A weird interface, my
888 # friend.
889 sub _principal_recentfile_fromremote_resosymlink {
890 my($self, $rfilename) = @_;
891 $rfilename = "RECENT.recent" unless length $rfilename;
892 my $abslfile = undef;
893 my $fh;
894 if ($rfilename =~ /\.recent$/) {
895 # may be a file *or* a symlink,
896 ($abslfile,$fh) = $self->_fetch_as_tempfile ($rfilename);
897 while (-l $abslfile) {
898 my $symlink = readlink $abslfile;
899 if ($symlink =~ m|/|) {
900 die "FIXME: filenames containing '/' not supported, got '$symlink'";
902 my $localrfile = File::Spec->catfile($self->localroot, $rfilename);
903 if (-e $localrfile) {
904 my $old_symlink = readlink $localrfile;
905 if ($old_symlink eq $symlink) {
906 unlink $abslfile or die "Cannot unlink '$abslfile': $!";
907 } else {
908 unlink $localrfile; # may fail
909 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
911 } else {
912 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
914 ($abslfile,$fh) = $self->_fetch_as_tempfile ($symlink);
917 return ($abslfile, $rfilename, $fh);
920 # takes a basename, returns an absolute name, does not delete the
921 # file, throws the $fh away. Caller must rename or unlink
923 # XXX needs to activate the fh in the rf0 so that it is able to unlink
924 # the file. I would like that the file is used immediately by $rf0
925 sub _fetch_as_tempfile {
926 my($self, $rfile) = @_;
927 my($suffix) = $rfile =~ /(\.[^\.]+)$/;
928 $suffix = "" unless defined $suffix;
929 my $fh = File::Temp->new
930 (TEMPLATE => sprintf(".FRMRecent-%s-XXXX",
931 $rfile,
933 DIR => $self->tempdir || $self->localroot,
934 SUFFIX => $suffix,
935 UNLINK => 0,
937 my $rsync;
938 unless ($rsync = File::Rsync->new($self->rsync_options)) {
939 require Carp;
940 Carp::confess(YAML::Syck::Dump($self->rsync_options));
942 my $dst = $fh->filename;
943 local($ENV{LANG}) = "C";
944 $rsync->exec
946 src => join("/",$self->remoteroot,$rfile),
947 dst => $dst,
948 ) or die "Could not mirror '$rfile' to $fh\: ".join(" ",$rsync->err);
949 unless (-l $dst) {
950 my $mode = 0644;
951 chmod $mode, $dst or die "Could not chmod $mode '$dst': $!";
953 return($dst,$fh);
956 =head2 $verbose = $obj->verbose ( $set )
958 Getter/setter method to set verbosity for this F:R:M:Recent object and
959 all associated Recentfile objects.
961 =cut
962 sub verbose {
963 my($self,$set) = @_;
964 if (defined $set) {
965 for ( @{$self->recentfiles} ) { $_->verbose($set) }
966 $self->_verbose ($set);
968 my $x = $self->_verbose;
969 unless (defined $x) {
970 $x = 0;
971 $self->_verbose ($x);
973 return $x;
977 =head2 my $vl = $obj->verboselog ( $set )
979 Getter/setter method for the path to the logfile to write verbose
980 progress information to.
982 Note: This is a primitive stop gap solution to get simple verbose
983 logging working. The program still sends error messages to STDERR.
984 Switching to Log4perl or similar is probably the way to go. TBD.
986 =cut
987 sub verboselog {
988 my($self,$set) = @_;
989 if (defined $set) {
990 for ( @{$self->recentfiles} ) { $_->verboselog($set) }
991 $self->_verboselog ($set);
993 my $x = $self->_verboselog;
994 unless (defined $x) {
995 $x = 0;
996 $self->_verboselog ($x);
998 return $x;
1001 =head1 THE ARCHITECTURE OF A COLLECTION OF RECENTFILES
1003 The idea is that we want to have a short file that records really
1004 recent changes. So that a fresh mirror can be kept fresh as long as
1005 the connectivity is given. Then we want longer files that record the
1006 history before. So when the mirror falls behind the update period
1007 reflected in the shortest file, it can complement the list of recent
1008 file events with the next one. And if this is not long enough we want
1009 another one, again a bit longer. And we want one that completes the
1010 history back to the oldest file. The index files do contain the
1011 complete list of current files. The longer a period covered by an
1012 index file is gone the less often the index file is updated. For
1013 practical reasons adjacent files will often overlap a bit but this is
1014 neither necessary nor enforced. That's the basic idea. The following
1015 example represents a tree that has a few updates every day:
1017 RECENT.recent -> RECENT-1h.yaml
1018 RECENT-1h.yaml
1019 RECENT-6h.yaml
1020 RECENT-1d.yaml
1021 RECENT-1M.yaml
1022 RECENT-1W.yaml
1023 RECENT-1Q.yaml
1024 RECENT-1Y.yaml
1025 RECENT-Z.yaml
1027 The first file is the principal file, in so far it is the one that is
1028 written first after a filesystem change. Usually a symlink links to it
1029 with a filename that has the same filenameroot and the suffix
1030 C<.recent>. On systems that do not support symlinks there is a plain
1031 copy maintained instead.
1033 The last file, the Z file, contains the complementary files that are
1034 in none of the other files. It may contain C<delete> events but often
1035 C<delete> events are discarded at the transition to the Z file.
1037 =head2 THE INDIVIDUAL RECENTFILE
1039 A I<recentfile> consists of a hash that has two keys: C<meta> and
1040 C<recent>. The C<meta> part has metadata and the C<recent> part has a
1041 list of fileobjects.
1043 =head2 THE META PART
1045 Here we find things that are pretty much self explaining: all
1046 lowercase attributes are accessors and as such explained in the
1047 manpages. The uppercase attribute C<Producers> contains version
1048 information about involved software components. Nothing to worry about
1049 as I believe.
1051 =head2 THE RECENT PART
1053 This is the interesting part. Every entry refers to some filesystem
1054 change (with path, epoch, type).
1056 The I<epoch> value is the point in time when some change was
1057 I<registered> but can be set to arbitrary values. Do not be tempted to
1058 believe that the entry has a direct relation to something like
1059 modification time or change time on the filesystem level. They are not
1060 reflecting release dates. (If you want exact release dates: Barbie is
1061 providing a database of them. See
1062 http://use.perl.org/~barbie/journal/37907).
1064 All these entries can be devided into two types (denoted by the
1065 I<type> attribute): C<new>s and C<delete>s. Changes and creations are
1066 C<new>s. Deletes are C<delete>s.
1068 Besides an I<epoch> and a I<type> attribute we find a third one:
1069 I<path>. This path is relative to the directory we find the
1070 I<recentfile> in.
1072 The order of the entries in the I<recentfile> is by decreasing epoch
1073 attribute. These are unique floating point numbers. When the server
1074 has ntp running correctly, then the timestamps are usually reflecting
1075 a real epoch. If time is running backwards, we trump the system epoch
1076 with strictly monotonically increasing floating point timestamps and
1077 guarantee they are unique.
1079 =head1 CORRUPTION AND RECOVERY
1081 If the origin host breaks the promise to deliver consistent and
1082 complete I<recentfiles> then it must update its C<dirtymark> and all
1083 slaves must discard what they cosider the truth. In the worst case
1084 that something goes wrong despite the dirtymark mechanism the way back
1085 to sanity can always be achieved through traditional rsyncing between
1086 the hosts.
1088 =head1 BACKGROUND
1090 This is about speeding up rsync operation on large trees. Uses a small
1091 metadata cocktail and pull technology.
1093 rersyncrecent solves this problem with a couple of (usually 2-10)
1094 lightweight index files which cover different overlapping time
1095 intervals. The master writes these files and the clients/slaves can
1096 construct the full tree from the information contained in them. The
1097 most recent index file usually covers the last seconds or minutes or
1098 hours of the tree and depending on the needs, slaves can rsync every
1099 few seconds or minutes and then bring their trees in full sync.
1101 The rersyncrecent model was developed for CPAN but as it is both
1102 convenient and economic it is also a general purpose solution. I'm
1103 looking forward to see a CPAN backbone that is only a few seconds
1104 behind PAUSE.
1106 =head2 NON-COMPETITORS
1108 File::Mirror JWU/File-Mirror/File-Mirror-0.10.tar.gz only local trees
1109 Mirror::YAML ADAMK/Mirror-YAML-0.03.tar.gz some sort of inner circle
1110 Net::DownloadMirror KNORR/Net-DownloadMirror-0.04.tar.gz FTP sites and stuff
1111 Net::MirrorDir KNORR/Net-MirrorDir-0.05.tar.gz dito
1112 Net::UploadMirror KNORR/Net-UploadMirror-0.06.tar.gz dito
1113 Pushmi::Mirror CLKAO/Pushmi-v1.0.0.tar.gz something SVK
1115 rsnapshot www.rsnapshot.org focus on backup
1116 csync www.csync.org more like unison
1117 multi-rsync sourceforge 167893 lan push to many
1118 chasm chasmd.org per-directory manifests
1120 =head2 COMPETITORS
1122 The problem to solve which clusters and ftp mirrors and otherwise
1123 replicated datasets like CPAN share: how to transfer only a minimum
1124 amount of data to determine the diff between two hosts.
1126 Normally it takes a long time to determine the diff itself before it
1127 can be transferred. Known solutions at the time of this writing are
1128 csync2, and rsync 3 batch mode.
1130 For many years the best solution was B<csync2> which solves the
1131 problem by maintaining a sqlite database on both ends and talking a
1132 highly sophisticated protocol to quickly determine which files to send
1133 and which to delete at any given point in time. Csync2 is often
1134 inconvenient because it is push technology and the act of syncing
1135 demands quite an intimate relationship between the sender and the
1136 receiver. This is hard to achieve in an environment of loosely coupled
1137 sites where the number of sites is large or connections are unreliable
1138 or network topology is changing.
1140 B<Rsync 3 batch mode> works around these problems by providing
1141 rsync-able batch files which allow receiving nodes to replay the
1142 history of the other nodes. This reduces the need to have an
1143 incestuous relation but it has the disadvantage that these batch files
1144 replicate the contents of the involved files. This seems inappropriate
1145 when the nodes already have a means of communicating over rsync.
1147 =head2 HONORABLE MENTION
1149 B<instantmirror> at https://fedorahosted.org/InstantMirror/ is an
1150 ambitious project that tries to combine various technologies (squid,
1151 bittorrent) to overcome the current slowness with the main focus on
1152 fedora. It's been founded in 2009-03 and at the time of this writing
1153 it is still a bit early to comment on.
1155 =head1 LIMITATIONS
1157 If the tree of the master server is changing faster than the bandwidth
1158 permits to mirror then additional protocols may need to be deployed.
1159 Certainly p2p/bittorrent can help in such situations because
1160 downloading sites help each other and bittorrent chunks large files
1161 into pieces.
1163 =head1 FUTURE DIRECTIONS
1165 Currently the origin server must keep track of injected and removed
1166 files. Should be supported by an inotify-based assistant.
1168 Convince other users outside the CPAN like
1169 http://fedoraproject.org/wiki/Infrastructure/Mirroring
1171 =head1 SEE ALSO
1173 L<File::Rsync::Mirror::Recentfile>,
1174 L<File::Rsync::Mirror::Recentfile::Done>,
1175 L<File::Rsync::Mirror::Recentfile::FakeBigFloat>
1177 =head1 BUGS
1179 Please report any bugs or feature requests through the web interface
1181 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=File-Rsync-Mirror-Recent>.
1182 I will be notified, and then you'll automatically be notified of
1183 progress on your bug as I make changes.
1185 =head1 SUPPORT
1187 You can find documentation for this module with the perldoc command.
1189 perldoc File::Rsync::Mirror::Recent
1191 You can also look for information at:
1193 =over 4
1195 =item * RT: CPAN's request tracker
1197 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=File-Rsync-Mirror-Recent>
1199 =item * AnnoCPAN: Annotated CPAN documentation
1201 L<http://annocpan.org/dist/File-Rsync-Mirror-Recent>
1203 =item * CPAN Ratings
1205 L<http://cpanratings.perl.org/d/File-Rsync-Mirror-Recent>
1207 =item * Search CPAN
1209 L<http://search.cpan.org/dist/File-Rsync-Mirror-Recent>
1211 =back
1214 =head1 ACKNOWLEDGEMENTS
1216 Thanks to RJBS for module-starter.
1218 =head1 AUTHOR
1220 Andreas König
1222 =head1 COPYRIGHT & LICENSE
1224 Copyright 2008, 2009 Andreas König.
1226 This program is free software; you can redistribute it and/or modify it
1227 under the same terms as Perl itself.
1230 =cut
1232 1; # End of File::Rsync::Mirror::Recent