ready for release
[rersyncrecent.git] / lib / File / Rsync / Mirror / Recent.pm
blob3c9d6888713b61b4ae8d11f0dde54d456368f314
1 package File::Rsync::Mirror::Recent;
3 # use warnings;
4 use strict;
5 use File::Rsync::Mirror::Recentfile;
7 =encoding utf-8
9 =head1 NAME
11 File::Rsync::Mirror::Recent - mirroring via rsync made efficient
13 =cut
15 package File::Rsync::Mirror::Recent;
17 use File::Basename qw(basename dirname fileparse);
18 use File::Copy qw(cp);
19 use File::Path qw(mkpath);
20 use File::Rsync;
21 use File::Rsync::Mirror::Recentfile::Done (); # at least needed by thaw()
22 use File::Rsync::Mirror::Recentfile::FakeBigFloat qw(:all);
23 use File::Temp;
24 use List::Pairwise qw(mapp grepp);
25 use List::Util qw(first max);
26 use Scalar::Util qw(blessed reftype);
27 use Storable;
28 use Time::HiRes qw();
29 use YAML::Syck;
31 use version; our $VERSION = qv('0.2.1');
33 =head1 SYNOPSIS
35 The documentation in here is normally not needed because the code is
36 considered to be run from several standalone programs. For a quick
37 overview, see the file README.mirrorcpan and the bin/ directory of the
38 distribution. For the architectural ideas see the section THE
39 ARCHITECTURE OF A COLLECTION OF RECENTFILES below.
41 File::Rsync::Mirror::Recent establishes a view on a collection of
42 File::Rsync::Mirror::Recentfile objects and provides abstractions
43 spanning multiple time intervals associated with those.
45 =head1 EXPORT
47 No exports.
49 =head1 CONSTRUCTORS
51 =head2 my $obj = CLASS->new(%hash)
53 Constructor. On every argument pair the key is a method name and the
54 value is an argument to that method name.
56 =cut
58 sub new {
59 my($class, @args) = @_;
60 my $self = bless {}, $class;
61 while (@args) {
62 my($method,$arg) = splice @args, 0, 2;
63 $self->$method($arg);
65 return $self;
68 =head2 my $obj = CLASS->thaw($statusfile)
70 Constructor from a statusfile left over from a previous
71 rmirror run. See also C<runstatusfile>.
73 =cut
75 sub _thaw_without_pathdb {
76 my($self,$file) = @_;
77 open my $fh, $file or die "Can't open '$file': $!";
78 local $/ = "\n";
79 my $in_pathdb = 0;
80 my $tfile = File::Temp->new
82 TEMPLATE => "Recent-thaw-XXXX",
83 TMPDIR => 1,
84 UNLINK => 0,
85 CLEANUP => 0,
86 SUFFIX => '.dat',
88 my $template_for_eop;
89 while (<$fh>) {
90 if ($in_pathdb) {
91 if (/$template_for_eop/) {
92 $in_pathdb = 0;
94 } elsif (/(\s+)-\s*__pathdb\s*:/) {
95 $in_pathdb = 1;
96 my $next_attr = sprintf "^%s\\S", " ?" x length($1);
97 $template_for_eop = qr{$next_attr};
99 print $tfile $_ unless $in_pathdb;
101 close $tfile or die "Could not close: $!";
102 my $return = $self->thaw($tfile->filename);
103 $return->_havelostpathdb(1);
104 unlink $tfile->filename;
105 return $return;
107 sub thaw {
108 my($self, $file) = @_;
109 die "thaw called without statusfile argument" unless defined $file;
110 unless (-e $file){
111 require Carp;
112 Carp::confess("Alert: statusfile '$file' not found");
114 require YAML::Syck;
115 my $start = time;
116 my $sleeptime = 0.02;
117 while (not mkdir "$file.lock") {
118 my $err = $!;
119 Time::HiRes::sleep $sleeptime;
120 my $waiting = time - $start;
121 if ($waiting >= 3){
122 warn "*** waiting ($waiting) for lock ($err) ***";
123 $sleeptime = 1;
126 my $size = -s $file;
127 my $serialized = YAML::Syck::LoadFile($file);
128 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
129 my $charged_self = $serialized->{reduced_self};
130 my $class = blessed $self;
131 bless $charged_self, $class;
132 my $rfs = $serialized->{reduced_rfs};
133 my $rfclass = $class . "file"; # "Recent" . "file"
134 my $pathdb = $charged_self->_pathdb;
135 for my $rf (@$rfs) {
136 bless $rf, $rfclass;
137 $rf->_pathdb($pathdb);
139 $charged_self->_recentfiles($rfs);
140 $charged_self->_principal_recentfile($rfs->[0]);
141 # die "FIXME: thaw all recentfiles from reduced_rfs into _recentfiles as well, watch out for pathdb and rsync";
142 return $charged_self;
145 =head1 ACCESSORS
147 =cut
149 my @accessors;
151 BEGIN {
152 @accessors =
154 "__pathdb",
155 "_dirtymark", # keeps track of the dirtymark of the recentfiles
156 "_havelostpathdb", # boolean
157 "_have_written_statusfile", # boolean
158 "_logfilefordone", # turns on _logfile on all DONE
159 # systems (disk intensive)
160 "_max_one_state", # when we have no time left but want
161 # at least get one file per
162 # iteration to avoid procrastination
163 "_principal_recentfile",
164 "_recentfiles",
165 "_rsync",
166 "_runstatusfile", # occasionally dumps all rfs
167 "_verbose", # internal variable for verbose setter/getter
168 "_verboselog", # internal variable for verboselog setter/getter
171 my @pod_lines =
172 split /\n/, <<'=cut'; push @accessors, grep {s/^=item\s+//} @pod_lines; }
174 =over 4
176 =item ignore_link_stat_errors
178 as in F:R:M:Recentfile
180 =item local
182 Option to specify the local principal file for operations with a local
183 collection of recentfiles.
185 =item localroot
187 as in F:R:M:Recentfile
189 =item max_files_per_connection
191 as in F:R:M:Recentfile
193 =item remote
195 The remote principal recentfile in rsync notation. E.g.
197 pause.perl.org::authors/RECENT.recent
199 =item remoteroot
201 as in F:R:M:Recentfile
203 =item remote_recentfile
205 Rsync address of the remote C<RECENT.recent> symlink or whichever name
206 the principal remote recentfile has.
208 =item rsync_options
210 Things like compress, links, times or checksums. Passed in to the
211 File::Rsync object used to run the mirror.
213 =item tempdir
215 as in F:R:M:Recentfile
217 =item ttl
219 Minimum time before fetching the principal recentfile again.
221 =back
223 =cut
225 use accessors @accessors;
227 =head1 METHODS
229 =head2 $arrayref = $obj->news ( %options )
231 Test this with:
233 perl -Ilib bin/rrr-news \
234 -after 1217200539 \
235 -max 12 \
236 -local /home/ftp/pub/PAUSE/authors/RECENT.recent
238 perl -Ilib bin/rrr-news \
239 -after 1217200539 \
240 -rsync=compress=1 \
241 -rsync=links=1 \
242 -localroot /home/ftp/pub/PAUSE/authors/ \
243 -remote pause.perl.org::authors/RECENT.recent
244 -verbose
246 All parameters that can be passed to
247 File:Rsync:Mirror:Recentfile::recent_events() can also be specified
248 here.
250 One additional option is supported. If C<$Options{callback}> is
251 specified, it must be a subref. This sub is called whenever one chunk
252 of events is found. The first argument to the callback is a reference
253 to the currently accumulated array of events.
255 Note: all data are kept in memory.
257 =cut
259 sub news {
260 my($self, %opt) = @_;
261 my $local = $self->local;
262 unless ($local) {
263 if (my $remote = $self->remote) {
264 my $localroot;
265 if ($localroot = $self->localroot) {
266 # nice, they know what they are doing
267 } else {
268 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
270 } else {
271 die "Alert: neither local nor remote specified, cannot continue";
274 my $rfs = $self->recentfiles;
275 my $ret = [];
276 my $before;
277 for my $rf (@$rfs) {
278 my %locopt = %opt;
279 $locopt{before} = $before;
280 if ($opt{max}) {
281 $locopt{max} -= scalar @$ret;
282 last if $locopt{max} <= 0;
284 $locopt{info} = {};
285 my $res = $rf->recent_events(%locopt);
286 if (@$res){
287 push @$ret, @$res;
289 if ($opt{max} && scalar @$ret > $opt{max}) {
290 last;
292 if ($opt{after}){
293 if ( $locopt{info}{last} && _bigfloatlt($locopt{info}{last}{epoch},$opt{after}) ) {
294 last;
296 if ( _bigfloatgt($opt{after},$locopt{info}{first}{epoch}) ) {
297 last;
300 if (!@$res){
301 next;
303 $before = $res->[-1]{epoch};
304 $before = $opt{before} if $opt{before} && _bigfloatlt($opt{before},$before);
305 if (my $sub = $opt{callback}) {
306 $sub->($ret);
309 $ret;
312 =head2 overview ( %options )
314 returns a small table that summarizes the state of all recentfiles
315 collected in this Recent object.
317 $options{verbose}=1 increases the number of columns displayed.
319 Here is an example output:
321 Ival Cnt Max Min Span Util Cloud
322 1h 47 1225053014.38 1225049650.91 3363.47 93.4% ^ ^
323 6h 324 1225052939.66 1225033394.84 19544.82 90.5% ^ ^
324 1d 437 1225049651.53 1224966402.53 83248.99 96.4% ^ ^
325 1W 1585 1225039015.75 1224435339.46 603676.29 99.8% ^ ^
326 1M 5855 1225017376.65 1222428503.57 2588873.08 99.9% ^ ^
327 1Q 17066 1224578930.40 1216803512.90 7775417.50 100.0% ^ ^
328 1Y 15901 1223966162.56 1216766820.67 7199341.89 22.8% ^ ^
329 Z 9909 1223966162.56 1216766820.67 7199341.89 - ^ ^
331 I<Max> is the name of the interval.
333 I<Cnt> is the number of entries in this recentfile.
335 I<Max> is the highest(first) epoch in this recentfile, rounded.
337 I<Min> is the lowest(last) epoch in this recentfile, rounded.
339 I<Span> is the timespan currently covered, rounded.
341 I<Util> is I<Span> devided by the designated timespan of this
342 recentfile.
344 I<Cloud> is ascii art illustrating the sequence of the Max and Min
345 timestamps.
347 =cut
348 sub overview {
349 my($self,%options) = @_;
350 my $rfs = $self->recentfiles;
351 my(@s,%rank);
352 RECENTFILE: for my $rf (@$rfs) {
353 my $re=$rf->recent_events;
354 my $rfsummary;
355 if (@$re) {
356 my $span = $re->[0]{epoch}-$re->[-1]{epoch};
357 my $merged = $rf->merged;
358 $rfsummary =
360 "Ival",
361 $rf->interval,
362 "Cnt",
363 scalar @$re,
364 "Dirtymark",
365 $rf->dirtymark ? sprintf("%.2f",$rf->dirtymark) : "-",
366 "Produced",
367 sprintf ("%.2f", $rf->{ORIG}{Producers}{time}||0),
368 "Merged",
369 ($rf->interval eq "Z"
373 sprintf ("%.2f", $merged->{epoch} || 0)),
374 "Max",
375 sprintf ("%.2f", $re->[0]{epoch}),
376 "Min",
377 sprintf ("%.2f", $re->[-1]{epoch}),
378 "Span",
379 sprintf ("%.2f", $span),
380 "Util", # u9n:)
381 ($rf->interval eq "Z"
385 sprintf ("%5.1f%%", 100 * $span / $rf->interval_secs)
388 @rank{mapp {$b} grepp {$a =~ /^(Max|Min)$/} @$rfsummary} = ();
389 } else {
390 next RECENTFILE;
392 push @s, $rfsummary;
394 @rank{sort {$b <=> $a} keys %rank} = 1..keys %rank;
395 my $maxrank = max values %rank;
396 for my $rfsummary (@s) {
397 my $string = " " x $maxrank;
398 my @borders;
399 for my $ele (qw(Max Min)) {
400 my($r) = mapp {$b} grepp {$a eq $ele} @$rfsummary;
401 push @borders, $rank{$r}-1;
403 for ($borders[0],$borders[1]) {
404 substr($string,$_,1) = "^";
406 push @$rfsummary, "Cloud", $string;
408 unless ($options{verbose}) {
409 my %filter = map {($_=>1)} qw(Ival Cnt Max Min Span Util Cloud);
410 for (@s) {
411 $_ = [mapp {($a,$b)} grepp {!!$filter{$a}} @$_];
414 my @sprintf;
415 for (my $i = 0; $i <= $#{$s[0]}; $i+=2) {
416 my $maxlength = max ((map { length $_->[$i+1] } @s), length $s[0][$i]);
417 push @sprintf, "%" . $maxlength . "s";
419 my $sprintf = join " ", @sprintf;
420 $sprintf .= "\n";
421 my $headline = sprintf $sprintf, mapp {$a} @{$s[0]};
422 join "", $headline, map { sprintf $sprintf, mapp {$b} @$_ } @s;
425 =head2 _pathdb
427 Keeping track of already handled files. Currently it is a hash, will
428 probably become a database with its own accessors.
430 =cut
432 sub _pathdb {
433 my($self, $set) = @_;
434 if ($set) {
435 $self->__pathdb ($set);
437 my $pathdb = $self->__pathdb;
438 unless (defined $pathdb) {
439 $self->__pathdb(+{});
441 return $self->__pathdb;
444 =head2 $recentfile = $obj->principal_recentfile ()
446 returns the principal recentfile object of this tree.
448 =cut
449 # mirrors the recentfile and instantiates the recentfile object
450 sub _principal_recentfile_fromremote {
451 my($self) = @_;
452 # get the remote recentfile
453 my $rrfile = $self->remote or die "Alert: cannot construct a recentfile object without the 'remote' attribute";
454 my $splitter = qr{(.+)/([^/]*)};
455 my($remoteroot,$rfilename) = $rrfile =~ $splitter;
456 $self->remoteroot($remoteroot);
457 my($abslfile, $fh);
458 if (!defined $rfilename) {
459 die "Alert: Cannot resolve '$rrfile', does not match $splitter";
460 } elsif (not length $rfilename or $rfilename eq "RECENT.recent") {
461 ($abslfile,$rfilename,$fh) = $self->_principal_recentfile_fromremote_resosymlink($rfilename);
463 my @need_args =
465 "ignore_link_stat_errors",
466 "localroot",
467 "max_files_per_connection",
468 "remoteroot",
469 "rsync_options",
470 "tempdir",
471 "ttl",
472 "verbose",
473 "verboselog",
475 my $rf0;
476 unless ($abslfile) {
477 $rf0 = File::Rsync::Mirror::Recentfile->new (map {($_ => $self->$_)} @need_args);
478 $rf0->split_rfilename($rfilename);
479 $abslfile = $rf0->get_remote_recentfile_as_tempfile ();
481 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ( $abslfile );
482 $rf0->_current_tempfile ( $abslfile );
483 $rf0->_current_tempfile_fh ( $fh );
484 $rf0->_use_tempfile (1);
485 for my $override (@need_args) {
486 $rf0->$override ( $self->$override );
488 $rf0->is_slave (1);
489 return $rf0;
491 sub principal_recentfile {
492 my($self) = @_;
493 my $rf0 = $self->_principal_recentfile;
494 return $rf0 if defined $rf0;
495 my $local = $self->local;
496 if ($local) {
497 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ($local);
498 } else {
499 if (my $remote = $self->remote) {
500 my $localroot;
501 if ($localroot = $self->localroot) {
502 # nice, they know what they are doing
503 } else {
504 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
506 $rf0 = $self->_principal_recentfile_fromremote;
507 } else {
508 die "Alert: neither local nor remote specified, cannot continue";
511 $self->_principal_recentfile($rf0);
512 return $rf0;
515 =head2 $recentfiles_arrayref = $obj->recentfiles ()
517 returns a reference to the complete list of recentfile objects that
518 describe this tree. No guarantee is given that the represented
519 recentfiles exist or have been read. They are just bare objects.
521 =cut
523 sub recentfiles {
524 my($self) = @_;
525 my $rfs = $self->_recentfiles;
526 return $rfs if defined $rfs;
527 my $rf0 = $self->principal_recentfile;
528 my $pathdb = $self->_pathdb;
529 $rf0->_pathdb ($pathdb);
530 my $aggregator = $rf0->aggregator;
531 my @rf = $rf0;
532 for my $agg (@$aggregator) {
533 my $nrf = $rf0->_sparse_clone;
534 $nrf->interval ( $agg );
535 $nrf->have_mirrored ( 0 );
536 $nrf->_pathdb ( $pathdb );
537 push @rf, $nrf;
539 $self->_recentfiles(\@rf);
540 return \@rf;
543 =head2 $success = $obj->rmirror ( %options )
545 Mirrors all recentfiles of the I<remote> address working through all
546 of them, mirroring their contents.
548 Test this with:
550 use File::Rsync::Mirror::Recent;
551 my $rrr = File::Rsync::Mirror::Recent->new(
552 ignore_link_stat_errors => 1,
553 localroot => "/home/ftp/pub/PAUSE/authors",
554 remote => "pause.perl.org::authors/RECENT.recent",
555 max_files_per_connection => 5000,
556 rsync_options => {
557 compress => 1,
558 links => 1,
559 times => 1,
560 checksum => 0,
562 verbose => 1,
563 _runstatusfile => "recent-rmirror-state.yml",
564 _logfilefordone => "recent-rmirror-donelog.log",
566 $rrr->rmirror ( "skip-deletes" => 1, loop => 1 );
568 Or try without the loop parameter and write the loop yourself:
570 use File::Rsync::Mirror::Recent;
571 my @rrr;
572 for my $t ("authors","modules"){
573 my $rrr = File::Rsync::Mirror::Recent->new(
574 ignore_link_stat_errors => 1,
575 localroot => "/home/ftp/pub/PAUSE/$t",
576 remote => "pause.perl.org::$t/RECENT.recent",
577 max_files_per_connection => 512,
578 rsync_options => {
579 compress => 1,
580 links => 1,
581 times => 1,
582 checksum => 0,
584 verbose => 1,
585 _runstatusfile => "recent-rmirror-state-$t.yml",
586 _logfilefordone => "recent-rmirror-donelog-$t.log",
587 ttl => 5,
589 push @rrr, $rrr;
591 while (){
592 for my $rrr (@rrr){
593 $rrr->rmirror ( "skip-deletes" => 1 );
595 warn "sleeping 23\n"; sleep 23;
599 =cut
600 # _alluptodate is unused but at least it worked last time I needed it,
601 # so let us keep it around
602 sub _alluptodate {
603 my($self) = @_;
604 my $sdm = $self->_dirtymark;
605 return unless defined $sdm;
606 for my $rf (@{$self->recentfiles}) {
607 return if $rf->seeded;
608 my $rfdm = $rf->dirtymark;
609 return unless defined $rfdm;
610 return unless $rfdm eq $sdm;
611 my $done = $rf->done;
612 return unless defined $done;
613 my $done_intervals = $done->_intervals;
614 return if !defined $done_intervals;
615 # nonono, may be more than one, only covered it must be:
616 # return if @$done_intervals > 1;
617 my $minmax = $rf->minmax;
618 return unless defined $minmax;
619 return unless $done->covered(@$minmax{qw(max min)});
621 # $DB::single++;
622 return 1;
624 sub _fullseed {
625 my($self) = @_;
626 for ( @{$self->recentfiles} ) { $_->seed(1) }
628 sub rmirror {
629 my($self, %options) = @_;
631 my $rfs = $self->recentfiles;
633 $self->principal_recentfile->seed;
634 my $_sigint = sub {
635 # XXX exit gracefully (reminder)
638 # XXX needs accessor: warning, if set too low, we do nothing but
639 # mirror the principal!
640 my $minimum_time_per_loop = 20;
642 if (my $logfile = $self->_logfilefordone) {
643 for my $i (0..$#$rfs) {
644 $rfs->[$i]->done->_logfile($logfile);
647 if (my $dirtymark = $self->principal_recentfile->dirtymark) {
648 my $mydm = $self->_dirtymark;
649 if (!defined $mydm){
650 $self->_dirtymark($dirtymark);
651 } elsif ($dirtymark ne $mydm) {
652 if ($self->verbose) {
653 my $fh;
654 if (my $vl = $self->verboselog) {
655 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
656 } else {
657 $fh = \*STDERR;
659 print $fh "NewDirtymark: old[$mydm] new[$dirtymark]\n";
661 $self->_dirtymark($dirtymark);
664 my $rstfile = $self->runstatusfile;
665 unless ($self->_have_written_statusfile) {
666 $self->_rmirror_runstatusfile_write ($rstfile, \%options);
667 $self->_have_written_statusfile(1);
669 $self->_rmirror_loop($minimum_time_per_loop,\%options);
672 sub _rmirror_loop {
673 my($self,$minimum_time_per_loop,$options) = @_;
674 LOOP: while () {
675 my $ttleave = time + $minimum_time_per_loop;
676 my $rstfile = $self->runstatusfile;
677 my $otherproc = $self->_thaw_without_pathdb ($rstfile);
678 my $pid = fork;
679 if (! defined $pid) {
680 warn "Contention: $!";
681 sleep 0.25;
682 next LOOP;
683 } elsif ($pid) {
684 waitpid($pid,0);
685 } else {
686 $self = $self->thaw ($rstfile);
687 my $rfs = $self->recentfiles;
688 $self->principal_recentfile->seed;
689 RECENTFILE: for my $i (0..$#$rfs) {
690 my $rf = $rfs->[$i];
691 if (time > $ttleave) {
692 # Must make sure that one file can get fetched in any case
693 $self->_max_one_state(1);
695 if ($rf->seeded) {
696 $self->_rmirror_mirror ($i, $options);
697 } elsif ($rf->uptodate) {
698 if ($i < $#$rfs) {
699 $rfs->[$i+1]->done->merge($rf->done);
701 # no further seed necessary because "periodic" does it
702 next RECENTFILE;
704 WORKUNIT: while (time < $ttleave) {
705 if ($rf->uptodate) {
706 $self->_rmirror_sleep_per_connection ($i);
707 next RECENTFILE;
708 } else {
709 $self->_rmirror_mirror ($i, $options);
712 if ($self->_max_one_state) {
713 last RECENTFILE;
716 $self->_max_one_state(0);
717 my $exit = 0;
718 if ($rfs->[-1]->uptodate) {
719 $self->_rmirror_cleanup;
721 unless ($options->{loop}) {
722 $exit = 1;
724 $self->_rmirror_runstatusfile_write ($rstfile, $options);
725 exit if $exit;
726 last LOOP;
729 $otherproc = $self->_thaw_without_pathdb ($rstfile);
730 if (!$options->{loop} && $otherproc && $otherproc->recentfiles->[-1]->uptodate) {
731 last LOOP;
733 my $sleep = $ttleave - time;
734 if ($sleep > 0.01) {
735 $self->_rmirror_endofloop_sleep ($sleep);
736 } else {
737 # negative time not invented yet:)
742 sub _rmirror_mirror {
743 my($self, $i, $options) = @_;
744 my $rfs = $self->recentfiles;
745 my $rf = $rfs->[$i];
746 my %locopt = %$options;
747 if ($self->_max_one_state) {
748 $locopt{max} = 1;
750 $locopt{piecemeal} = 1;
751 $rf->mirror (%locopt);
752 if ($i==0) {
753 # we limit to 0 for the case that upstream is broken and has
754 # more than one timestamp (happened on PAUSE 200903)
755 if (my $dirtymark = $rf->dirtymark) {
756 my $mydm = $self->_dirtymark;
757 if (!defined $mydm or $dirtymark ne $mydm) {
758 $self->_dirtymark($dirtymark);
759 $self->_fullseed;
765 sub _rmirror_sleep_per_connection {
766 my($self, $i) = @_;
767 my $rfs = $self->recentfiles;
768 my $rf = $rfs->[$i];
769 my $sleep = $rf->sleep_per_connection;
770 $sleep = 0.42 unless defined $sleep;
771 Time::HiRes::sleep $sleep;
772 $rfs->[$i+1]->done->merge($rf->done) if $i < $#$rfs;
775 sub _rmirror_cleanup {
776 my($self) = @_;
777 my $pathdb = $self->_pathdb();
778 for my $k (keys %$pathdb) {
779 delete $pathdb->{$k};
781 my $rfs = $self->recentfiles;
782 for my $i (0..$#$rfs-1) {
783 my $thismerged = $rfs->[$i]->merged;
784 my $next = $rfs->[$i+1];
785 my $nextminmax = $next->minmax;
786 if (not defined $thismerged->{epoch} or _bigfloatlt($nextminmax->{max},$thismerged->{epoch})){
787 $next->seed;
792 =head2 $file = $obj->runstatusfile ($set)
794 Getter/setter for C<_runstatusfile> attribute. Defaults to a temporary
795 file created by C<File::Temp>. A status file is required for
796 C<rmirror> working. Since it may be interesting for debugging
797 purposes, you may want to specify a permanent file for this.
799 =cut
800 sub runstatusfile {
801 my($self,$set) = @_;
802 if (defined $set) {
803 $self->_runstatusfile ($set);
805 my $x = $self->_runstatusfile;
806 unless (defined $x) {
807 require File::Temp;
808 my $tfile = File::Temp->new
810 TEMPLATE => "Recent-XXXX",
811 TMPDIR => 1,
812 UNLINK => 0,
813 CLEANUP => 0,
814 SUFFIX => '.dat',
816 $self->_runstatusfile($tfile->filename);
818 return $self->_runstatusfile;
821 # unused code.... it was an oops, discovered the thaw() method too
822 # late, and starting writing this here....
823 sub _rmirror_runstatusfile_read {
824 my($self, $file) = @_;
826 require YAML::Syck;
827 my $start = time;
828 # XXX is locking useful here?
829 while (not mkdir "$file.lock") {
830 Time::HiRes::sleep 0.2;
831 warn "*** waiting for lock ***" if time - $start >= 3;
833 my $yml = YAML::Syck::LoadFile $file;
834 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
835 my $rself = $yml->{reduced_self};
836 my $rfs = $yml->{reduced_rfs};
837 # XXX bring them into self
840 sub _rmirror_runstatusfile_write {
841 my($self, $file, $options) = @_;
842 my $rself;
843 while (my($k,$v) = each %$self) {
844 next if $k =~ /^-(_principal_recentfile|_recentfiles)$/;
845 $rself->{$k} = $v;
847 my $rfs = $self->recentfiles;
848 my $rrfs;
849 for my $i (0..$#$rfs) {
850 my $rf = $rfs->[$i];
851 while (my($k,$v) = each %$rf) {
852 next if $k =~ /^-(_current_tempfile_fh|_pathdb|_rsync)$/;
853 $rrfs->[$i]{$k} = $rfs->[$i]{$k};
856 require YAML::Syck;
857 my $start = time;
858 while (not mkdir "$file.lock") {
859 Time::HiRes::sleep 0.15;
860 warn "*** waiting for lock directory '$file.lock' ***" if time - $start >= 3;
862 YAML::Syck::DumpFile
864 "$file.new",
866 options => $options,
867 time => time,
868 reduced_rfs => $rrfs,
869 reduced_self => $rself,
871 rename "$file.new", $file or die "Could not rename: $!";
872 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
875 sub _rmirror_endofloop_sleep {
876 my($self, $sleep) = @_;
877 if ($self->verbose) {
878 my $fh;
879 if (my $vl = $self->verboselog) {
880 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
881 } else {
882 $fh = \*STDERR;
884 printf $fh
886 "Dorm %d (%s secs)\n",
887 time,
888 $sleep,
891 sleep $sleep;
894 # it returns two things: abslfile and rfilename. But the abslfile is
895 # undef when the rfilename ends in .recent. A weird interface, my
896 # friend.
897 sub _principal_recentfile_fromremote_resosymlink {
898 my($self, $rfilename) = @_;
899 $rfilename = "RECENT.recent" unless length $rfilename;
900 my $abslfile = undef;
901 my $fh;
902 if ($rfilename =~ /\.recent$/) {
903 # may be a file *or* a symlink,
904 ($abslfile,$fh) = $self->_fetch_as_tempfile ($rfilename);
905 while (-l $abslfile) {
906 my $symlink = readlink $abslfile;
907 if ($symlink =~ m|/|) {
908 die "FIXME: filenames containing '/' not supported, got '$symlink'";
910 my $localrfile = File::Spec->catfile($self->localroot, $rfilename);
911 if (-e $localrfile) {
912 my $old_symlink = readlink $localrfile;
913 if ($old_symlink eq $symlink) {
914 unlink $abslfile or die "Cannot unlink '$abslfile': $!";
915 } else {
916 unlink $localrfile; # may fail
917 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
919 } else {
920 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
922 ($abslfile,$fh) = $self->_fetch_as_tempfile ($symlink);
925 return ($abslfile, $rfilename, $fh);
928 # takes a basename, returns an absolute name, does not delete the
929 # file, throws the $fh away. Caller must rename or unlink
931 # XXX needs to activate the fh in the rf0 so that it is able to unlink
932 # the file. I would like that the file is used immediately by $rf0
933 sub _fetch_as_tempfile {
934 my($self, $rfile) = @_;
935 my($suffix) = $rfile =~ /(\.[^\.]+)$/;
936 $suffix = "" unless defined $suffix;
937 my $fh = File::Temp->new
938 (TEMPLATE => sprintf(".FRMRecent-%s-XXXX",
939 $rfile,
941 DIR => $self->tempdir || $self->localroot,
942 SUFFIX => $suffix,
943 UNLINK => 0,
945 my $rsync;
946 unless ($rsync = File::Rsync->new($self->rsync_options)) {
947 require Carp;
948 Carp::confess(YAML::Syck::Dump($self->rsync_options));
950 my $dst = $fh->filename;
951 local($ENV{LANG}) = "C";
952 $rsync->exec
954 src => join("/",$self->remoteroot,$rfile),
955 dst => $dst,
956 ) or die "Could not mirror '$rfile' to $fh\: ".join(" ",$rsync->err);
957 unless (-l $dst) {
958 my $mode = 0644;
959 chmod $mode, $dst or die "Could not chmod $mode '$dst': $!";
961 return($dst,$fh);
964 =head2 $verbose = $obj->verbose ( $set )
966 Getter/setter method to set verbosity for this F:R:M:Recent object and
967 all associated Recentfile objects.
969 =cut
970 sub verbose {
971 my($self,$set) = @_;
972 if (defined $set) {
973 for ( @{$self->recentfiles} ) { $_->verbose($set) }
974 $self->_verbose ($set);
976 my $x = $self->_verbose;
977 unless (defined $x) {
978 $x = 0;
979 $self->_verbose ($x);
981 return $x;
985 =head2 my $vl = $obj->verboselog ( $set )
987 Getter/setter method for the path to the logfile to write verbose
988 progress information to.
990 Note: This is a primitive stop gap solution to get simple verbose
991 logging working. The program still sends error messages to STDERR.
992 Switching to Log4perl or similar is probably the way to go. TBD.
994 =cut
995 sub verboselog {
996 my($self,$set) = @_;
997 if (defined $set) {
998 for ( @{$self->recentfiles} ) { $_->verboselog($set) }
999 $self->_verboselog ($set);
1001 my $x = $self->_verboselog;
1002 unless (defined $x) {
1003 $x = 0;
1004 $self->_verboselog ($x);
1006 return $x;
1009 =head1 THE ARCHITECTURE OF A COLLECTION OF RECENTFILES
1011 The idea is that we want to have a short file that records really
1012 recent changes. So that a fresh mirror can be kept fresh as long as
1013 the connectivity is given. Then we want longer files that record the
1014 history before. So when the mirror falls behind the update period
1015 reflected in the shortest file, it can complement the list of recent
1016 file events with the next one. And if this is not long enough we want
1017 another one, again a bit longer. And we want one that completes the
1018 history back to the oldest file. The index files do contain the
1019 complete list of current files. The longer a period covered by an
1020 index file is gone the less often the index file is updated. For
1021 practical reasons adjacent files will often overlap a bit but this is
1022 neither necessary nor enforced. That's the basic idea. The following
1023 example represents a tree that has a few updates every day:
1025 RECENT.recent -> RECENT-1h.yaml
1026 RECENT-1h.yaml
1027 RECENT-6h.yaml
1028 RECENT-1d.yaml
1029 RECENT-1M.yaml
1030 RECENT-1W.yaml
1031 RECENT-1Q.yaml
1032 RECENT-1Y.yaml
1033 RECENT-Z.yaml
1035 The first file is the principal file, in so far it is the one that is
1036 written first after a filesystem change. Usually a symlink links to it
1037 with a filename that has the same filenameroot and the suffix
1038 C<.recent>. On systems that do not support symlinks there is a plain
1039 copy maintained instead.
1041 The last file, the Z file, contains the complementary files that are
1042 in none of the other files. It may contain C<delete> events but often
1043 C<delete> events are discarded at the transition to the Z file.
1045 =head2 THE INDIVIDUAL RECENTFILE
1047 A I<recentfile> consists of a hash that has two keys: C<meta> and
1048 C<recent>. The C<meta> part has metadata and the C<recent> part has a
1049 list of fileobjects.
1051 =head2 THE META PART
1053 Here we find things that are pretty much self explaining: all
1054 lowercase attributes are accessors and as such explained in the
1055 manpages. The uppercase attribute C<Producers> contains version
1056 information about involved software components. Nothing to worry about
1057 as I believe.
1059 =head2 THE RECENT PART
1061 This is the interesting part. Every entry refers to some filesystem
1062 change (with path, epoch, type).
1064 The I<epoch> value is the point in time when some change was
1065 I<registered> but can be set to arbitrary values. Do not be tempted to
1066 believe that the entry has a direct relation to something like
1067 modification time or change time on the filesystem level. They are not
1068 reflecting release dates. (If you want exact release dates: Barbie is
1069 providing a database of them. See
1070 http://use.perl.org/~barbie/journal/37907).
1072 All these entries can be devided into two types (denoted by the
1073 I<type> attribute): C<new>s and C<delete>s. Changes and creations are
1074 C<new>s. Deletes are C<delete>s.
1076 Besides an I<epoch> and a I<type> attribute we find a third one:
1077 I<path>. This path is relative to the directory we find the
1078 I<recentfile> in.
1080 The order of the entries in the I<recentfile> is by decreasing epoch
1081 attribute. These are unique floating point numbers. When the server
1082 has ntp running correctly, then the timestamps are usually reflecting
1083 a real epoch. If time is running backwards, we trump the system epoch
1084 with strictly monotonically increasing floating point timestamps and
1085 guarantee they are unique.
1087 =head1 CORRUPTION AND RECOVERY
1089 If the origin host breaks the promise to deliver consistent and
1090 complete I<recentfiles> then it must update its C<dirtymark> and all
1091 slaves must discard what they cosider the truth. In the worst case
1092 that something goes wrong despite the dirtymark mechanism the way back
1093 to sanity can always be achieved through traditional rsyncing between
1094 the hosts.
1096 =head1 BACKGROUND
1098 This is about speeding up rsync operation on large trees. Uses a small
1099 metadata cocktail and pull technology.
1101 rersyncrecent solves this problem with a couple of (usually 2-10)
1102 lightweight index files which cover different overlapping time
1103 intervals. The master writes these files and the clients/slaves can
1104 construct the full tree from the information contained in them. The
1105 most recent index file usually covers the last seconds or minutes or
1106 hours of the tree and depending on the needs, slaves can rsync every
1107 few seconds or minutes and then bring their trees in full sync.
1109 The rersyncrecent model was developed for CPAN but as it is both
1110 convenient and economic it is also a general purpose solution. I'm
1111 looking forward to see a CPAN backbone that is only a few seconds
1112 behind PAUSE.
1114 =head2 NON-COMPETITORS
1116 File::Mirror JWU/File-Mirror/File-Mirror-0.10.tar.gz only local trees
1117 Mirror::YAML ADAMK/Mirror-YAML-0.03.tar.gz some sort of inner circle
1118 Net::DownloadMirror KNORR/Net-DownloadMirror-0.04.tar.gz FTP sites and stuff
1119 Net::MirrorDir KNORR/Net-MirrorDir-0.05.tar.gz dito
1120 Net::UploadMirror KNORR/Net-UploadMirror-0.06.tar.gz dito
1121 Pushmi::Mirror CLKAO/Pushmi-v1.0.0.tar.gz something SVK
1123 rsnapshot www.rsnapshot.org focus on backup
1124 csync www.csync.org more like unison
1125 multi-rsync sourceforge 167893 lan push to many
1126 chasm chasmd.org per-directory manifests
1128 =head2 COMPETITORS
1130 The problem to solve which clusters and ftp mirrors and otherwise
1131 replicated datasets like CPAN share: how to transfer only a minimum
1132 amount of data to determine the diff between two hosts.
1134 Normally it takes a long time to determine the diff itself before it
1135 can be transferred. Known solutions at the time of this writing are
1136 csync2, and rsync 3 batch mode.
1138 For many years the best solution was B<csync2> which solves the
1139 problem by maintaining a sqlite database on both ends and talking a
1140 highly sophisticated protocol to quickly determine which files to send
1141 and which to delete at any given point in time. Csync2 is often
1142 inconvenient because it is push technology and the act of syncing
1143 demands quite an intimate relationship between the sender and the
1144 receiver. This is hard to achieve in an environment of loosely coupled
1145 sites where the number of sites is large or connections are unreliable
1146 or network topology is changing.
1148 B<Rsync 3 batch mode> works around these problems by providing
1149 rsync-able batch files which allow receiving nodes to replay the
1150 history of the other nodes. This reduces the need to have an
1151 incestuous relation but it has the disadvantage that these batch files
1152 replicate the contents of the involved files. This seems inappropriate
1153 when the nodes already have a means of communicating over rsync.
1155 =head2 HONORABLE MENTION
1157 B<instantmirror> at https://fedorahosted.org/InstantMirror/ is an
1158 ambitious project that tries to combine various technologies (squid,
1159 bittorrent) to overcome the current slowness with the main focus on
1160 fedora. It's been founded in 2009-03 and at the time of this writing
1161 it is still a bit early to comment on.
1163 =head1 LIMITATIONS
1165 If the tree of the master server is changing faster than the bandwidth
1166 permits to mirror then additional protocols may need to be deployed.
1167 Certainly p2p/bittorrent can help in such situations because
1168 downloading sites help each other and bittorrent chunks large files
1169 into pieces.
1171 =head1 FUTURE DIRECTIONS
1173 Currently the origin server must keep track of injected and removed
1174 files. Should be supported by an inotify-based assistant.
1176 Convince other users outside the CPAN like
1177 http://fedoraproject.org/wiki/Infrastructure/Mirroring
1179 =head1 SEE ALSO
1181 L<File::Rsync::Mirror::Recentfile>,
1182 L<File::Rsync::Mirror::Recentfile::Done>,
1183 L<File::Rsync::Mirror::Recentfile::FakeBigFloat>
1185 =head1 BUGS
1187 Please report any bugs or feature requests through the web interface
1189 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=File-Rsync-Mirror-Recent>.
1190 I will be notified, and then you'll automatically be notified of
1191 progress on your bug as I make changes.
1193 =head1 SUPPORT
1195 You can find documentation for this module with the perldoc command.
1197 perldoc File::Rsync::Mirror::Recent
1199 You can also look for information at:
1201 =over 4
1203 =item * RT: CPAN's request tracker
1205 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=File-Rsync-Mirror-Recent>
1207 =item * AnnoCPAN: Annotated CPAN documentation
1209 L<http://annocpan.org/dist/File-Rsync-Mirror-Recent>
1211 =item * CPAN Ratings
1213 L<http://cpanratings.perl.org/d/File-Rsync-Mirror-Recent>
1215 =item * Search CPAN
1217 L<http://search.cpan.org/dist/File-Rsync-Mirror-Recent>
1219 =back
1222 =head1 ACKNOWLEDGEMENTS
1224 Thanks to RJBS for module-starter.
1226 =head1 AUTHOR
1228 Andreas König
1230 =head1 COPYRIGHT & LICENSE
1232 Copyright 2008, 2009 Andreas König.
1234 This program is free software; you can redistribute it and/or modify it
1235 under the same terms as Perl itself.
1238 =cut
1240 1; # End of File::Rsync::Mirror::Recent