releasing the trial release as stable
[rersyncrecent.git] / lib / File / Rsync / Mirror / Recent.pm
blob9d802a7d81f56cdb1235450a694e290998a74817
1 package File::Rsync::Mirror::Recent;
3 # use warnings;
4 use strict;
5 use File::Rsync::Mirror::Recentfile;
7 =encoding utf-8
9 =head1 NAME
11 File::Rsync::Mirror::Recent - mirroring via rsync made efficient
13 =cut
15 package File::Rsync::Mirror::Recent;
17 use File::Basename qw(basename dirname fileparse);
18 use File::Copy qw(cp);
19 use File::Path qw(mkpath);
20 use File::Rsync;
21 use File::Rsync::Mirror::Recentfile::Done (); # at least needed by thaw()
22 use File::Rsync::Mirror::Recentfile::FakeBigFloat qw(:all);
23 use File::Temp;
24 use List::Pairwise qw(mapp grepp);
25 use List::Util qw(first max);
26 use Scalar::Util qw(blessed reftype);
27 use Storable;
28 use Time::HiRes qw();
29 use YAML::Syck;
31 use version; our $VERSION = qv('0.4.6');
33 =head1 SYNOPSIS
35 The documentation in here is normally not needed because the code is
36 considered to be run from several standalone programs. For a quick
37 overview, see the file README.mirrorcpan and the bin/ directory of the
38 distribution. For the architectural ideas see the section THE
39 ARCHITECTURE OF A COLLECTION OF RECENTFILES below.
41 File::Rsync::Mirror::Recent establishes a view on a collection of
42 File::Rsync::Mirror::Recentfile objects and provides abstractions
43 spanning multiple time intervals associated with those.
45 =head1 EXPORT
47 No exports.
49 =head1 CONSTRUCTORS
51 =head2 my $obj = CLASS->new(%hash)
53 Constructor. On every argument pair the key is a method name and the
54 value is an argument to that method name.
56 =cut
58 sub new {
59 my($class, @args) = @_;
60 my $self = bless {}, $class;
61 while (@args) {
62 my($method,$arg) = splice @args, 0, 2;
63 $self->$method($arg);
65 return $self;
68 =head2 my $obj = CLASS->thaw($statusfile)
70 Constructor from a statusfile left over from a previous
71 rmirror run. See also C<runstatusfile>.
73 =cut
75 sub _thaw_without_pathdb {
76 my($self,$file) = @_;
77 open my $fh, $file or die "Can't open '$file': $!";
78 local $/ = "\n";
79 my $in_pathdb = 0;
80 my $tfile = File::Temp->new
82 TEMPLATE => "Recent-thaw-XXXX",
83 TMPDIR => 1,
84 UNLINK => 0,
85 CLEANUP => 0,
86 SUFFIX => '.dat',
88 my $template_for_eop;
89 while (<$fh>) {
90 if ($in_pathdb) {
91 if (/$template_for_eop/) {
92 $in_pathdb = 0;
94 } elsif (/(\s+)-\s*__pathdb\s*:/) {
95 $in_pathdb = 1;
96 my $next_attr = sprintf "^%s\\S", " ?" x length($1);
97 $template_for_eop = qr{$next_attr};
99 print $tfile $_ unless $in_pathdb;
101 close $tfile or die "Could not close: $!";
102 my $return = $self->thaw($tfile->filename);
103 $return->_havelostpathdb(1);
104 unlink $tfile->filename;
105 return $return;
107 sub thaw {
108 my($self, $file) = @_;
109 die "thaw called without statusfile argument" unless defined $file;
110 unless (-e $file){
111 require Carp;
112 Carp::confess("Alert: statusfile '$file' not found");
114 require YAML::Syck;
115 my $start = time;
116 my $sleeptime = 0.02;
117 while (not mkdir "$file.lock") {
118 my $err = $!;
119 Time::HiRes::sleep $sleeptime;
120 my $waiting = time - $start;
121 if ($waiting >= 3){
122 warn "*** waiting ($waiting) for lock ($err) ***";
123 $sleeptime = 1;
126 my $size = -s $file;
127 my $serialized = YAML::Syck::LoadFile($file);
128 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
129 my $charged_self = $serialized->{reduced_self};
130 my $class = blessed $self;
131 bless $charged_self, $class;
132 my $rfs = $serialized->{reduced_rfs};
133 my $rfclass = $class . "file"; # "Recent" . "file"
134 my $pathdb = $charged_self->_pathdb;
135 for my $rf (@$rfs) {
136 bless $rf, $rfclass;
137 $rf->_pathdb($pathdb);
139 $charged_self->_recentfiles($rfs);
140 $charged_self->_principal_recentfile($rfs->[0]);
141 # die "FIXME: thaw all recentfiles from reduced_rfs into _recentfiles as well, watch out for pathdb and rsync";
142 return $charged_self;
145 =head1 ACCESSORS
147 =cut
149 my @accessors;
151 BEGIN {
152 @accessors =
154 "__pathdb",
155 "_dirtymark", # keeps track of the dirtymark of the recentfiles
156 "_havelostpathdb", # boolean
157 "_have_written_statusfile", # boolean
158 "_logfilefordone", # turns on _logfile on all DONE
159 # systems (disk intensive)
160 "_max_one_state", # when we have no time left but want
161 # at least get one file per
162 # iteration to avoid procrastination
163 "_principal_recentfile",
164 "_recentfiles",
165 "_rsync",
166 "_runstatusfile", # occasionally dumps all rfs
167 "_verbose", # internal variable for verbose setter/getter
168 "_verboselog", # internal variable for verboselog setter/getter
171 my @pod_lines =
172 split /\n/, <<'=cut'; push @accessors, grep {s/^=item\s+//} @pod_lines; }
174 =over 4
176 =item ignore_link_stat_errors
178 as in F:R:M:Recentfile
180 =item local
182 Option to specify the local principal file for operations with a local
183 collection of recentfiles.
185 =item localroot
187 as in F:R:M:Recentfile
189 =item max_files_per_connection
191 as in F:R:M:Recentfile
193 =item remote
195 The remote principal recentfile in rsync notation. E.g.
197 pause.perl.org::authors/RECENT.recent
199 =item remoteroot
201 as in F:R:M:Recentfile
203 =item remote_recentfile
205 Rsync address of the remote C<RECENT.recent> symlink or whichever name
206 the principal remote recentfile has.
208 =item rsync_options
210 Things like compress, links, times or checksums. Passed in to the
211 File::Rsync object used to run the mirror. Can be a hashref or an
212 arrayref. Depending on the version of File::Rsync it is passed on as a
213 hashref or as a flat list.
215 =item tempdir
217 as in F:R:M:Recentfile
219 =item ttl
221 Minimum time before fetching the principal recentfile again.
223 =back
225 =cut
227 use accessors @accessors;
229 =head1 METHODS
231 =head2 $arrayref = $obj->news ( %options )
233 Test this with:
235 perl -Ilib bin/rrr-news \
236 -after 1217200539 \
237 -max 12 \
238 -local /home/ftp/pub/PAUSE/authors/RECENT.recent
240 perl -Ilib bin/rrr-news \
241 -after 1217200539 \
242 -rsync=compress=1 \
243 -rsync=links=1 \
244 -localroot /home/ftp/pub/PAUSE/authors/ \
245 -remote pause.perl.org::authors/RECENT.recent
246 -verbose
248 All parameters that can be passed to
249 File:Rsync:Mirror:Recentfile::recent_events() can also be specified
250 here.
252 One additional option is supported. If C<$Options{callback}> is
253 specified, it must be a subref. This sub is called whenever one chunk
254 of events is found. The first argument to the callback is a reference
255 to the currently accumulated array of events.
257 Note: all data are kept in memory.
259 =cut
261 sub news {
262 my($self, %opt) = @_;
263 my $local = $self->local;
264 unless ($local) {
265 if (my $remote = $self->remote) {
266 my $localroot;
267 if ($localroot = $self->localroot) {
268 # nice, they know what they are doing
269 } else {
270 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
272 } else {
273 die "Alert: neither local nor remote specified, cannot continue";
276 my $rfs = $self->recentfiles;
277 my $ret = [];
278 my $before;
279 for my $rf (@$rfs) {
280 my %locopt = %opt;
281 $locopt{before} = $before;
282 if ($opt{max}) {
283 $locopt{max} -= scalar @$ret;
284 last if $locopt{max} <= 0;
286 $locopt{info} = {};
287 my $res = $rf->recent_events(%locopt);
288 if (@$res){
289 push @$ret, @$res;
291 if ($opt{max} && scalar @$ret > $opt{max}) {
292 last;
294 if ($opt{after}){
295 if ( $locopt{info}{last} && _bigfloatlt($locopt{info}{last}{epoch},$opt{after}) ) {
296 last;
298 if ( _bigfloatgt($opt{after},$locopt{info}{first}{epoch}) ) {
299 last;
302 if (!@$res){
303 next;
305 $before = $res->[-1]{epoch};
306 $before = $opt{before} if $opt{before} && _bigfloatlt($opt{before},$before);
307 if (my $sub = $opt{callback}) {
308 $sub->($ret);
311 $ret;
314 =head2 overview ( %options )
316 returns a small table that summarizes the state of all recentfiles
317 collected in this Recent object.
319 $options{verbose}=1 increases the number of columns displayed.
321 Here is an example output:
323 Ival Cnt Max Min Span Util Cloud
324 1h 47 1225053014.38 1225049650.91 3363.47 93.4% ^ ^
325 6h 324 1225052939.66 1225033394.84 19544.82 90.5% ^ ^
326 1d 437 1225049651.53 1224966402.53 83248.99 96.4% ^ ^
327 1W 1585 1225039015.75 1224435339.46 603676.29 99.8% ^ ^
328 1M 5855 1225017376.65 1222428503.57 2588873.08 99.9% ^ ^
329 1Q 17066 1224578930.40 1216803512.90 7775417.50 100.0% ^ ^
330 1Y 15901 1223966162.56 1216766820.67 7199341.89 22.8% ^ ^
331 Z 9909 1223966162.56 1216766820.67 7199341.89 - ^ ^
333 I<Max> is the name of the interval.
335 I<Cnt> is the number of entries in this recentfile.
337 I<Max> is the highest(first) epoch in this recentfile, rounded.
339 I<Min> is the lowest(last) epoch in this recentfile, rounded.
341 I<Span> is the timespan currently covered, rounded.
343 I<Util> is I<Span> devided by the designated timespan of this
344 recentfile.
346 I<Cloud> is ascii art illustrating the sequence of the Max and Min
347 timestamps.
349 =cut
350 sub overview {
351 my($self,%options) = @_;
352 my $rfs = $self->recentfiles;
353 my(@s,%rank);
354 RECENTFILE: for my $rf (@$rfs) {
355 my $re=$rf->recent_events;
356 my $rfsummary;
357 if (@$re) {
358 my $span = $re->[0]{epoch}-$re->[-1]{epoch};
359 my $merged = $rf->merged;
360 $rfsummary =
362 "Ival",
363 $rf->interval,
364 "Cnt",
365 scalar @$re,
366 "Dirtymark",
367 $rf->dirtymark ? sprintf("%.2f",$rf->dirtymark) : "-",
368 "Produced",
369 sprintf ("%.2f", $rf->{ORIG}{Producers}{time}||0),
370 "Merged",
371 ($rf->interval eq "Z"
375 sprintf ("%.2f", $merged->{epoch} || 0)),
376 "Max",
377 sprintf ("%.2f", $re->[0]{epoch}),
378 "Min",
379 sprintf ("%.2f", $re->[-1]{epoch}),
380 "Span",
381 sprintf ("%.2f", $span),
382 "Util", # u9n:)
383 ($rf->interval eq "Z"
387 sprintf ("%5.1f%%", 100 * $span / $rf->interval_secs)
390 @rank{mapp {$b} grepp {$a =~ /^(Max|Min)$/} @$rfsummary} = ();
391 } else {
392 next RECENTFILE;
394 push @s, $rfsummary;
396 @rank{sort {$b <=> $a} keys %rank} = 1..keys %rank;
397 my $maxrank = max values %rank;
398 for my $rfsummary (@s) {
399 my $string = " " x $maxrank;
400 my @borders;
401 for my $ele (qw(Max Min)) {
402 my($r) = mapp {$b} grepp {$a eq $ele} @$rfsummary;
403 push @borders, $rank{$r}-1;
405 for ($borders[0],$borders[1]) {
406 substr($string,$_,1) = "^";
408 push @$rfsummary, "Cloud", $string;
410 unless ($options{verbose}) {
411 my %filter = map {($_=>1)} qw(Ival Cnt Max Min Span Util Cloud);
412 for (@s) {
413 $_ = [mapp {($a,$b)} grepp {!!$filter{$a}} @$_];
416 my @sprintf;
417 for (my $i = 0; $i <= $#{$s[0]}; $i+=2) {
418 my $maxlength = max ((map { length $_->[$i+1] } @s), length $s[0][$i]);
419 push @sprintf, "%" . $maxlength . "s";
421 my $sprintf = join " ", @sprintf;
422 $sprintf .= "\n";
423 my $headline = sprintf $sprintf, mapp {$a} @{$s[0]};
424 join "", $headline, map { sprintf $sprintf, mapp {$b} @$_ } @s;
427 =head2 _pathdb
429 Keeping track of already handled files. Currently it is a hash, will
430 probably become a database with its own accessors.
432 =cut
434 sub _pathdb {
435 my($self, $set) = @_;
436 if ($set) {
437 $self->__pathdb ($set);
439 my $pathdb = $self->__pathdb;
440 unless (defined $pathdb) {
441 $self->__pathdb(+{});
443 return $self->__pathdb;
446 =head2 $recentfile = $obj->principal_recentfile ()
448 returns the principal recentfile object of this tree.
450 =cut
451 # mirrors the recentfile and instantiates the recentfile object
452 sub _principal_recentfile_fromremote {
453 my($self) = @_;
454 # get the remote recentfile
455 my $rrfile = $self->remote or die "Alert: cannot construct a recentfile object without the 'remote' attribute";
456 my $splitter = qr{(.+)/([^/]*)};
457 my($remoteroot,$rfilename) = $rrfile =~ $splitter;
458 $self->remoteroot($remoteroot);
459 my($abslfile, $fh);
460 if (!defined $rfilename) {
461 die "Alert: Cannot resolve '$rrfile', does not match $splitter";
462 } elsif (not length $rfilename or $rfilename eq "RECENT.recent") {
463 ($abslfile,$rfilename,$fh) = $self->_principal_recentfile_fromremote_resosymlink($rfilename);
465 my @need_args =
467 "ignore_link_stat_errors",
468 "localroot",
469 "max_files_per_connection",
470 "remoteroot",
471 "rsync_options",
472 "tempdir",
473 "ttl",
474 "verbose",
475 "verboselog",
477 my $rf0;
478 unless ($abslfile) {
479 $rf0 = File::Rsync::Mirror::Recentfile->new (map {($_ => $self->$_)} @need_args);
480 $rf0->split_rfilename($rfilename);
481 $abslfile = $rf0->get_remote_recentfile_as_tempfile ();
483 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ( $abslfile );
484 $rf0->_current_tempfile ( $abslfile );
485 $rf0->_current_tempfile_fh ( $fh );
486 $rf0->_use_tempfile (1);
487 for my $override (@need_args) {
488 $rf0->$override ( $self->$override );
490 $rf0->is_slave (1);
491 return $rf0;
493 sub principal_recentfile {
494 my($self) = @_;
495 my $rf0 = $self->_principal_recentfile;
496 return $rf0 if defined $rf0;
497 my $local = $self->local;
498 if ($local) {
499 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ($local);
500 } else {
501 if (my $remote = $self->remote) {
502 my $localroot;
503 if ($localroot = $self->localroot) {
504 # nice, they know what they are doing
505 } else {
506 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
508 $rf0 = $self->_principal_recentfile_fromremote;
509 } else {
510 die "Alert: neither local nor remote specified, cannot continue";
513 $self->_principal_recentfile($rf0);
514 return $rf0;
517 =head2 $recentfiles_arrayref = $obj->recentfiles ()
519 returns a reference to the complete list of recentfile objects that
520 describe this tree. No guarantee is given that the represented
521 recentfiles exist or have been read. They are just bare objects.
523 =cut
525 sub recentfiles {
526 my($self) = @_;
527 my $rfs = $self->_recentfiles;
528 return $rfs if defined $rfs;
529 my $rf0 = $self->principal_recentfile;
530 my $pathdb = $self->_pathdb;
531 $rf0->_pathdb ($pathdb);
532 my $aggregator = $rf0->aggregator;
533 my @rf = $rf0;
534 for my $agg (@$aggregator) {
535 my $nrf = $rf0->_sparse_clone;
536 $nrf->interval ( $agg );
537 $nrf->have_mirrored ( 0 );
538 $nrf->_pathdb ( $pathdb );
539 push @rf, $nrf;
541 $self->_recentfiles(\@rf);
542 return \@rf;
545 =head2 $success = $obj->rmirror ( %options )
547 Mirrors all recentfiles of the I<remote> address working through all
548 of them, mirroring their contents.
550 Test this with:
552 use File::Rsync::Mirror::Recent;
553 my $rrr = File::Rsync::Mirror::Recent->new(
554 ignore_link_stat_errors => 1,
555 localroot => "/home/ftp/pub/PAUSE/authors",
556 remote => "pause.perl.org::authors/RECENT.recent",
557 max_files_per_connection => 5000,
558 rsync_options => {
559 compress => 1,
560 links => 1,
561 times => 1,
562 checksum => 0,
564 verbose => 1,
565 _runstatusfile => "recent-rmirror-state.yml",
566 _logfilefordone => "recent-rmirror-donelog.log",
568 $rrr->rmirror ( "skip-deletes" => 1, loop => 1 );
570 Or try without the loop parameter and write the loop yourself:
572 use File::Rsync::Mirror::Recent;
573 my @rrr;
574 for my $t ("authors","modules"){
575 my $rrr = File::Rsync::Mirror::Recent->new(
576 ignore_link_stat_errors => 1,
577 localroot => "/home/ftp/pub/PAUSE/$t",
578 remote => "pause.perl.org::$t/RECENT.recent",
579 max_files_per_connection => 512,
580 rsync_options => {
581 compress => 1,
582 links => 1,
583 times => 1,
584 checksum => 0,
586 verbose => 1,
587 _runstatusfile => "recent-rmirror-state-$t.yml",
588 _logfilefordone => "recent-rmirror-donelog-$t.log",
589 ttl => 5,
591 push @rrr, $rrr;
593 while (){
594 for my $rrr (@rrr){
595 $rrr->rmirror ( "skip-deletes" => 1 );
597 warn "sleeping 23\n"; sleep 23;
601 =cut
602 # _alluptodate is unused but at least it worked last time I needed it,
603 # so let us keep it around
604 sub _alluptodate {
605 my($self) = @_;
606 my $sdm = $self->_dirtymark;
607 return unless defined $sdm;
608 for my $rf (@{$self->recentfiles}) {
609 return if $rf->seeded;
610 my $rfdm = $rf->dirtymark;
611 return unless defined $rfdm;
612 return unless $rfdm eq $sdm;
613 my $done = $rf->done;
614 return unless defined $done;
615 my $done_intervals = $done->_intervals;
616 return if !defined $done_intervals;
617 # nonono, may be more than one, only covered it must be:
618 # return if @$done_intervals > 1;
619 my $minmax = $rf->minmax;
620 return unless defined $minmax;
621 return unless $done->covered(@$minmax{qw(max min)});
623 # $DB::single++;
624 return 1;
626 sub _fullseed {
627 my($self) = @_;
628 for ( @{$self->recentfiles} ) { $_->seed(1) }
630 sub rmirror {
631 my($self, %options) = @_;
633 my $rfs = $self->recentfiles;
635 $self->principal_recentfile->seed;
636 my $_sigint = sub {
637 # XXX exit gracefully (reminder)
640 # XXX needs accessor: warning, if set too low, we do nothing but
641 # mirror the principal!
642 my $minimum_time_per_loop = 20;
644 if (my $logfile = $self->_logfilefordone) {
645 for my $i (0..$#$rfs) {
646 $rfs->[$i]->done->_logfile($logfile);
649 if (my $dirtymark = $self->principal_recentfile->dirtymark) {
650 my $mydm = $self->_dirtymark;
651 if (!defined $mydm){
652 $self->_dirtymark($dirtymark);
653 } elsif ($dirtymark ne $mydm) {
654 if ($self->verbose) {
655 my $fh;
656 if (my $vl = $self->verboselog) {
657 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
658 } else {
659 $fh = \*STDERR;
661 print $fh "NewDirtymark: old[$mydm] new[$dirtymark]\n";
663 $self->_dirtymark($dirtymark);
666 my $rstfile = $self->runstatusfile;
667 unless ($self->_have_written_statusfile) {
668 $self->_rmirror_runstatusfile_write ($rstfile, \%options);
669 $self->_have_written_statusfile(1);
671 $self->_rmirror_loop($minimum_time_per_loop,\%options);
674 sub _rmirror_loop {
675 my($self,$minimum_time_per_loop,$options) = @_;
676 LOOP: while () {
677 my $ttleave = time + $minimum_time_per_loop;
678 my $rstfile = $self->runstatusfile;
679 my $otherproc = $self->_thaw_without_pathdb ($rstfile);
680 my $pid = fork;
681 if (! defined $pid) {
682 warn "Contention: $!";
683 sleep 0.25;
684 next LOOP;
685 } elsif ($pid) {
686 waitpid($pid,0);
687 } else {
688 $self = $self->thaw ($rstfile);
689 my $rfs = $self->recentfiles;
690 $self->principal_recentfile->seed;
691 RECENTFILE: for my $i (0..$#$rfs) {
692 my $rf = $rfs->[$i];
693 if (time > $ttleave) {
694 # Must make sure that one file can get fetched in any case
695 $self->_max_one_state(1);
697 if ($rf->seeded) {
698 $self->_rmirror_mirror ($i, $options);
699 } elsif ($rf->uptodate) {
700 if ($i < $#$rfs) {
701 $rfs->[$i+1]->done->merge($rf->done);
703 # no further seed necessary because "periodic" does it
704 next RECENTFILE;
706 WORKUNIT: while (time < $ttleave) {
707 if ($rf->uptodate) {
708 $self->_rmirror_sleep_per_connection ($i);
709 next RECENTFILE;
710 } else {
711 $self->_rmirror_mirror ($i, $options);
714 if ($self->_max_one_state) {
715 last RECENTFILE;
718 $self->_max_one_state(0);
719 my $exit = 0;
720 if ($rfs->[-1]->uptodate) {
721 $self->_rmirror_cleanup;
723 unless ($options->{loop}) {
724 $exit = 1;
726 $self->_rmirror_runstatusfile_write ($rstfile, $options);
727 exit if $exit;
728 last LOOP;
731 $otherproc = $self->_thaw_without_pathdb ($rstfile);
732 if (!$options->{loop} && $otherproc && $otherproc->recentfiles->[-1]->uptodate) {
733 last LOOP;
735 my $sleep = $ttleave - time;
736 if ($sleep > 0.01) {
737 $self->_rmirror_endofloop_sleep ($sleep);
738 } else {
739 # negative time not invented yet:)
744 sub _rmirror_mirror {
745 my($self, $i, $options) = @_;
746 my $rfs = $self->recentfiles;
747 my $rf = $rfs->[$i];
748 my %locopt = %$options;
749 if ($self->_max_one_state) {
750 $locopt{max} = 1;
752 $locopt{piecemeal} = 1;
753 $rf->mirror (%locopt);
754 if ($i==0) {
755 # we limit to 0 for the case that upstream is broken and has
756 # more than one timestamp (happened on PAUSE 200903)
757 if (my $dirtymark = $rf->dirtymark) {
758 my $mydm = $self->_dirtymark;
759 if (!defined $mydm or $dirtymark ne $mydm) {
760 $self->_dirtymark($dirtymark);
761 $self->_fullseed;
767 sub _rmirror_sleep_per_connection {
768 my($self, $i) = @_;
769 my $rfs = $self->recentfiles;
770 my $rf = $rfs->[$i];
771 my $sleep = $rf->sleep_per_connection;
772 $sleep = 0.42 unless defined $sleep;
773 Time::HiRes::sleep $sleep;
774 $rfs->[$i+1]->done->merge($rf->done) if $i < $#$rfs;
777 sub _rmirror_cleanup {
778 my($self) = @_;
779 my $pathdb = $self->_pathdb();
780 for my $k (keys %$pathdb) {
781 delete $pathdb->{$k};
783 my $rfs = $self->recentfiles;
784 for my $i (0..$#$rfs-1) {
785 my $thismerged = $rfs->[$i]->merged;
786 my $next = $rfs->[$i+1];
787 my $nextminmax = $next->minmax;
788 if (not defined $thismerged->{epoch} or _bigfloatlt($nextminmax->{max},$thismerged->{epoch})){
789 $next->seed;
794 =head2 $file = $obj->runstatusfile ($set)
796 Getter/setter for C<_runstatusfile> attribute. Defaults to a temporary
797 file created by C<File::Temp>. A status file is required for
798 C<rmirror> working. Since it may be interesting for debugging
799 purposes, you may want to specify a permanent file for this.
801 =cut
802 sub runstatusfile {
803 my($self,$set) = @_;
804 if (defined $set) {
805 $self->_runstatusfile ($set);
807 my $x = $self->_runstatusfile;
808 unless (defined $x) {
809 require File::Temp;
810 my $tfile = File::Temp->new
812 TEMPLATE => "Recent-XXXX",
813 TMPDIR => 1,
814 UNLINK => 0,
815 CLEANUP => 0,
816 SUFFIX => '.dat',
818 $self->_runstatusfile($tfile->filename);
820 return $self->_runstatusfile;
823 # unused code.... it was an oops, discovered the thaw() method too
824 # late, and starting writing this here....
825 sub _rmirror_runstatusfile_read {
826 my($self, $file) = @_;
828 require YAML::Syck;
829 my $start = time;
830 # XXX is locking useful here?
831 while (not mkdir "$file.lock") {
832 Time::HiRes::sleep 0.2;
833 warn "*** waiting for lock ***" if time - $start >= 3;
835 my $yml = YAML::Syck::LoadFile $file;
836 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
837 my $rself = $yml->{reduced_self};
838 my $rfs = $yml->{reduced_rfs};
839 # XXX bring them into self
842 sub _rmirror_runstatusfile_write {
843 my($self, $file, $options) = @_;
844 my $rself;
845 while (my($k,$v) = each %$self) {
846 next if $k =~ /^-(_principal_recentfile|_recentfiles)$/;
847 $rself->{$k} = $v;
849 my $rfs = $self->recentfiles;
850 my $rrfs;
851 for my $i (0..$#$rfs) {
852 my $rf = $rfs->[$i];
853 while (my($k,$v) = each %$rf) {
854 next if $k =~ /^-(_current_tempfile_fh|_pathdb|_rsync)$/;
855 $rrfs->[$i]{$k} = $rfs->[$i]{$k};
858 require YAML::Syck;
859 my $start = time;
860 while (not mkdir "$file.lock") {
861 Time::HiRes::sleep 0.15;
862 warn "*** waiting for lock directory '$file.lock' ***" if time - $start >= 3;
864 YAML::Syck::DumpFile
866 "$file.new",
868 options => $options,
869 time => time,
870 reduced_rfs => $rrfs,
871 reduced_self => $rself,
873 rename "$file.new", $file or die "Could not rename: $!";
874 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
877 sub _rmirror_endofloop_sleep {
878 my($self, $sleep) = @_;
879 if ($self->verbose) {
880 my $fh;
881 if (my $vl = $self->verboselog) {
882 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
883 } else {
884 $fh = \*STDERR;
886 printf $fh
888 "Dorm %d (%s secs)\n",
889 time,
890 $sleep,
893 sleep $sleep;
896 # it returns two things: abslfile and rfilename. But the abslfile is
897 # undef when the rfilename ends in .recent. A weird interface, my
898 # friend.
899 sub _principal_recentfile_fromremote_resosymlink {
900 my($self, $rfilename) = @_;
901 $rfilename = "RECENT.recent" unless length $rfilename;
902 my $abslfile = undef;
903 my $fh;
904 if ($rfilename =~ /\.recent$/) {
905 # may be a file *or* a symlink,
906 ($abslfile,$fh) = $self->_fetch_as_tempfile ($rfilename);
907 while (-l $abslfile) {
908 my $symlink = readlink $abslfile;
909 if ($symlink =~ m|/|) {
910 die "FIXME: filenames containing '/' not supported, got '$symlink'";
912 my $localrfile = File::Spec->catfile($self->localroot, $rfilename);
913 if (-e $localrfile) {
914 my $old_symlink = readlink $localrfile;
915 if ($old_symlink eq $symlink) {
916 unlink $abslfile or die "Cannot unlink '$abslfile': $!";
917 } else {
918 unlink $localrfile; # may fail
919 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
921 } else {
922 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
924 ($abslfile,$fh) = $self->_fetch_as_tempfile ($symlink);
927 return ($abslfile, $rfilename, $fh);
930 # takes a basename, returns an absolute name, does not delete the
931 # file, throws the $fh away. Caller must rename or unlink
933 # XXX needs to activate the fh in the rf0 so that it is able to unlink
934 # the file. I would like that the file is used immediately by $rf0
935 sub _fetch_as_tempfile {
936 my($self, $rfile) = @_;
937 my($suffix) = $rfile =~ /(\.[^\.]+)$/;
938 $suffix = "" unless defined $suffix;
939 my $fh = File::Temp->new
940 (TEMPLATE => sprintf(".FRMRecent-%s-XXXX",
941 $rfile,
943 DIR => $self->tempdir || $self->localroot,
944 SUFFIX => $suffix,
945 UNLINK => 0,
947 my $rsync;
948 my @rsync_options;
949 if (my $rso = $self->rsync_options) {
950 if (ref $rso eq "HASH") {
951 @rsync_options = %$rso;
952 } elsif (ref $rso eq "ARRAY") {
953 @rsync_options = @$rso;
955 } else {
956 @rsync_options = ();
958 if ($File::Rsync::VERSION <= 0.45) {
959 $rsync = File::Rsync->new({@rsync_options});
960 } else {
961 $rsync = File::Rsync->new(@rsync_options);
963 unless ($rsync) {
964 require Carp;
965 Carp::confess(YAML::Syck::Dump($self->rsync_options));
967 my $dst = $fh->filename;
968 local($ENV{LANG}) = "C";
969 $rsync->exec
971 src => join("/",$self->remoteroot,$rfile),
972 dst => $dst,
973 ) or die "Could not mirror '$rfile' to $fh\: ".join(" ",$rsync->err);
974 unless (-l $dst) {
975 my $mode = 0644;
976 chmod $mode, $dst or die "Could not chmod $mode '$dst': $!";
978 return($dst,$fh);
981 =head2 $verbose = $obj->verbose ( $set )
983 Getter/setter method to set verbosity for this F:R:M:Recent object and
984 all associated Recentfile objects.
986 =cut
987 sub verbose {
988 my($self,$set) = @_;
989 if (defined $set) {
990 for ( @{$self->recentfiles} ) { $_->verbose($set) }
991 $self->_verbose ($set);
993 my $x = $self->_verbose;
994 unless (defined $x) {
995 $x = 0;
996 $self->_verbose ($x);
998 return $x;
1002 =head2 my $vl = $obj->verboselog ( $set )
1004 Getter/setter method for the path to the logfile to write verbose
1005 progress information to.
1007 Note: This is a primitive stop gap solution to get simple verbose
1008 logging working. The program still sends error messages to STDERR.
1009 Switching to Log4perl or similar is probably the way to go. TBD.
1011 =cut
1012 sub verboselog {
1013 my($self,$set) = @_;
1014 if (defined $set) {
1015 for ( @{$self->recentfiles} ) { $_->verboselog($set) }
1016 $self->_verboselog ($set);
1018 my $x = $self->_verboselog;
1019 unless (defined $x) {
1020 $x = 0;
1021 $self->_verboselog ($x);
1023 return $x;
1026 =head1 THE ARCHITECTURE OF A COLLECTION OF RECENTFILES
1028 The idea is that we want to have a short file that records really
1029 recent changes. So that a fresh mirror can be kept fresh as long as
1030 the connectivity is given. Then we want longer files that record the
1031 history before. So when the mirror falls behind the update period
1032 reflected in the shortest file, it can complement the list of recent
1033 file events with the next one. And if this is not long enough we want
1034 another one, again a bit longer. And we want one that completes the
1035 history back to the oldest file. The index files together do contain
1036 the complete list of current files. The longer a period covered by an
1037 index file is gone the less often the index file is updated. For
1038 practical reasons adjacent files will often overlap a bit but this is
1039 neither necessary nor enforced. Enforced is only that there must not
1040 ever be a gap between two adjacent index files that would have to
1041 contain a file reference. That's the basic idea. The following example
1042 represents a tree that has a few updates every day:
1044 RECENT.recent -> RECENT-1h.yaml
1045 RECENT-1h.yaml
1046 RECENT-6h.yaml
1047 RECENT-1d.yaml
1048 RECENT-1M.yaml
1049 RECENT-1W.yaml
1050 RECENT-1Q.yaml
1051 RECENT-1Y.yaml
1052 RECENT-Z.yaml
1054 Each of these files represents a contract to hold a record for every
1055 filesystem event within the period indicated in the filename.
1057 The first file is the principal file, in so far it is the one that is
1058 written first after a filesystem change. Usually a symlink links to it
1059 with a filename that has the same filenameroot and the suffix
1060 C<.recent>. On systems that do not support symlinks there is a plain
1061 copy maintained instead.
1063 The last file, the Z file, contains the complementary files that are
1064 in none of the other files. It may contain C<delete> events but often
1065 C<delete> events are discarded at the transition to the Z file.
1067 =head2 SITE SEEING TOUR
1069 This section illustrates the operation of a server-client couple in a
1070 fictious installation that has to deal with a long time of inactivity.
1071 I think such an edge case installation demonstrates the economic
1072 behaviour of our model of overlapping time slices best.
1074 The sleeping beauty (http://en.wikipedia.org/wiki/Sleeping_Beauty) is
1075 a classic fairytale of a princess sleeping for a hundred years. The
1076 story inspired the test case 02-aurora.t.
1078 Given an upstream server where the people stop feeding new files for
1079 one hundred years. That upstream server has no driving energy to do
1080 major changes to its RECENT files. Cronjobs will continue to shift
1081 things towards the Z file but soon will stop doing so since all of
1082 them have to keep their promise to record files covering a certain
1083 period. Soon all RECENT files will cover exactly their native period.
1085 Downstream servers will stubbornly ask their question to the rsync
1086 server whether there is a newer RECENT.recent. As soon as the smallest
1087 RECENT file has reached the state of maximum possible merge with the
1088 second smallest RECENT file, the answer of the rsync server will
1089 always be: nothing new. And downstream servers that were uptodate on
1090 the previous request will be satisfied and do nothing. Never will they
1091 request a download. The answer that there is no change is sufficient
1092 to determine that there is no change in the whole tree.
1094 Let's presume the smallest RECENT file on this castle is a 1h file and
1095 downstream decides to ask every 30 minutes. Now the hundred years are
1096 over and upstream starts producing files again. One file every minute.
1097 After one minute it will move old files over to the, say, 1d file. In
1098 the next sixty minutes it will not be allowed to move any other file
1099 over to the 1d file. At some point in time downstream will ask the
1100 obligatory question "anything new?" and it will get the current 1h
1101 file. It will recognize in the meta part of the current file which
1102 timestamps have been moved to the 1d file, it will recognize that it
1103 has all those. It will have no need to download the 1d file, it will
1104 download the missing files and be done. No second RECENT file needs to
1105 be downloaded.
1107 Downstream only decides to download another RECENT file when not doing
1108 so would result in a gap between two recent files. Such that
1109 consistency checks would become impossible. Or for potentially
1110 interested third parties, like down-down-stream servers.
1112 Downloads of RECENT files are subject to rsync optimizations in that
1113 rsync does some level of blockwise checksumming that is considered
1114 efficient to avoid copying blocks of data that have not changed. Our
1115 format is that of an ordered array, so that large blocks stay constant
1116 when elements are prepended to the array. This means we usually do not
1117 have to rsync full RECENT files. Only if they are really small, the
1118 rsync algorithm will not come into play but that's OK for small files.
1120 Upstream servers are extremely lazy in writing the larger files. See
1121 File::Rsync::Mirror::Recentfile::aggregate() for the specs. Long
1122 before the one hundred years are over, the upstream server will stop
1123 changing files. Slowly everything that existed before upstream fell
1124 asleep trickles into the Z file. Say, the second-largest RECENT file
1125 is a 1Y file and the third-largest RECENT file is a 1Q file, then it
1126 will take at least one quarter of a year that the 1Y file will be
1127 merged into the Z file. From that point in time everything will have
1128 been merged into the Z file and the server's job to call C<aggregate>
1129 regularly will become a noop. Consequently downstream will never again
1130 download anything. Just the obligatory question: anything new?
1132 =head2 THE INDIVIDUAL RECENTFILE
1134 A I<recentfile> consists of a hash that has two keys: C<meta> and
1135 C<recent>. The C<meta> part has metadata and the C<recent> part has a
1136 list of fileobjects.
1138 =head2 THE META PART
1140 Here we find things that are pretty much self explaining: all
1141 lowercase attributes are accessors and as such explained in the
1142 manpages. The uppercase attribute C<Producers> contains version
1143 information about involved software components.
1145 Even though the lowercase attributes are documented in the
1146 F:R:M:Recentfile manpage, let's focus on the important stuff to make
1147 sure nothing goes by unnoticed: meta contains the aggregator levels in
1148 use in this installation, in other words the names of the RECENT
1149 files, eg:
1151 aggregator:
1152 - 3s
1153 - 8s
1154 - 21s
1155 - 55s
1158 It contains a dirtymark telling us the timestamp of the last protocol
1159 violation of the upstream server:
1161 dirtymark: '1325093856.49272'
1163 Plus a few things convenient in a situation where we need to do some
1164 debugging.
1166 And it contains information about which timestamp is the maximum
1167 timestamp in the neighboring file. This is probably the most important
1168 data in meta:
1170 merged:
1171 epoch: 1307159461.94575
1173 This keeps track of the highest epoch we would find if we looked into
1174 the next RECENT file.
1176 Another entry is the minmax, eg:
1178 minmax:
1179 max: 1307161441.97444
1180 min: 1307140103.70322
1182 The merged/epoch and minmax examples above illustrate one case of an
1183 overlap (130715... is between 130716... and 130714...). The syncing
1184 strategy for the client is in general the imperative: if the interval
1185 covered by a recentfile (minmax) and the interval covered by the next
1186 higher recentfile (merged/epoch) do not overlap anymore, then it is
1187 time to refresh the next recentfile.
1189 =head2 THE RECENT PART
1191 This is the interesting part. Every entry refers to some filesystem
1192 change (with path, epoch, type).
1194 The I<epoch> value is the point in time when some change was
1195 I<registered> but can be set to arbitrary values. Do not be tempted to
1196 believe that the entry has a direct relation to something like
1197 modification time or change time on the filesystem level. They are not
1198 reflecting release dates. (If you want exact release dates: Barbie is
1199 providing a database of them. See
1200 http://use.perl.org/~barbie/journal/37907).
1202 All these entries can be devided into two types (denoted by the
1203 I<type> attribute): C<new>s and C<delete>s. Changes and creations are
1204 C<new>s. Deletes are C<delete>s.
1206 Besides an I<epoch> and a I<type> attribute we find a third one:
1207 I<path>. This path is relative to the directory we find the
1208 I<recentfile> in.
1210 The order of the entries in the I<recentfile> is by decreasing epoch
1211 attribute. These are unique floating point numbers. When the server
1212 has ntp running correctly, then the timestamps are usually reflecting
1213 a real epoch. If time is running backwards, we trump the system epoch
1214 with strictly monotonically increasing floating point timestamps and
1215 guarantee they are unique.
1217 =head1 CORRUPTION AND RECOVERY
1219 If the origin host breaks the promise to deliver consistent and
1220 complete I<recentfiles> then it must update its C<dirtymark> and all
1221 slaves must discard what they cosider the truth.
1223 In the worst case that something goes wrong despite the dirtymark
1224 mechanism the way back to sanity can be achieved through traditional
1225 rsyncing between the hosts. But please be wary doing that: mixing
1226 traditional rsync and the F:R:M:R technique can lead to gratuitous
1227 extra errors. If you're the last host in a chain, there's nobody you
1228 can disturb, but if you have downstream clients, it is possible that
1229 rsync copies a RECENT file before the contained files are actually
1230 available.
1232 =head1 BACKGROUND
1234 This is about speeding up rsync operation on large trees. Uses a small
1235 metadata cocktail and pull technology.
1237 rersyncrecent solves this problem with a couple of (usually 2-10)
1238 lightweight index files which cover different overlapping time
1239 intervals. The master writes these files and the clients/slaves can
1240 construct the full tree from the information contained in them. The
1241 most recent index file usually covers the last seconds or minutes or
1242 hours of the tree and depending on the needs, slaves can rsync every
1243 few seconds or minutes and then bring their trees in full sync.
1245 The rersyncrecent model was developed for CPAN but as it is both
1246 convenient and economic it is also a general purpose solution. I'm
1247 looking forward to see a CPAN backbone that is only a few seconds
1248 behind PAUSE.
1250 =head2 NON-COMPETITORS
1252 File::Mirror JWU/File-Mirror/File-Mirror-0.10.tar.gz only local trees
1253 Mirror::YAML ADAMK/Mirror-YAML-0.03.tar.gz some sort of inner circle
1254 Net::DownloadMirror KNORR/Net-DownloadMirror-0.04.tar.gz FTP sites and stuff
1255 Net::MirrorDir KNORR/Net-MirrorDir-0.05.tar.gz dito
1256 Net::UploadMirror KNORR/Net-UploadMirror-0.06.tar.gz dito
1257 Pushmi::Mirror CLKAO/Pushmi-v1.0.0.tar.gz something SVK
1259 rsnapshot www.rsnapshot.org focus on backup
1260 csync www.csync.org more like unison
1261 multi-rsync sourceforge 167893 lan push to many
1262 chasm chasmd.org per-directory manifests
1264 =head2 COMPETITORS
1266 The problem to solve which clusters and ftp mirrors and otherwise
1267 replicated datasets like CPAN share: how to transfer only a minimum
1268 amount of data to determine the diff between two hosts.
1270 Normally it takes a long time to determine the diff itself before it
1271 can be transferred. Known solutions at the time of this writing are
1272 csync2, and rsync 3 batch mode.
1274 For many years the best solution was B<csync2> which solves the
1275 problem by maintaining a sqlite database on both ends and talking a
1276 highly sophisticated protocol to quickly determine which files to send
1277 and which to delete at any given point in time. Csync2 is often
1278 inconvenient because it is push technology and the act of syncing
1279 demands quite an intimate relationship between the sender and the
1280 receiver. This is hard to achieve in an environment of loosely coupled
1281 sites where the number of sites is large or connections are unreliable
1282 or network topology is changing.
1284 B<Rsync 3 batch mode> works around these problems by providing
1285 rsync-able batch files which allow receiving nodes to replay the
1286 history of the other nodes. This reduces the need to have an
1287 incestuous relation but it has the disadvantage that these batch files
1288 replicate the contents of the involved files. This seems inappropriate
1289 when the nodes already have a means of communicating over rsync.
1291 =head2 HONORABLE MENTION
1293 B<instantmirror> at https://fedorahosted.org/InstantMirror/ is an
1294 ambitious project that tries to combine various technologies (squid,
1295 bittorrent) to overcome the current slowness with the main focus on
1296 fedora. It's been founded in 2009-03 and at the time of this writing
1297 it is still a bit early to comment on.
1299 =head1 LIMITATIONS
1301 If the tree of the master server is changing faster than the bandwidth
1302 permits to mirror then additional protocols may need to be deployed.
1303 Certainly p2p/bittorrent can help in such situations because
1304 downloading sites help each other and bittorrent chunks large files
1305 into pieces.
1307 =head1 INOTIFY
1309 Currently the origin server has two options. The traditional one is to
1310 strictly keep track of injected and removed files through all involved
1311 processes and call C<update> on every file system event. The other
1312 option is to let data come in and use the assistance of inotify. PAUSE
1313 is running the former, the cpan master site is running the latter.
1314 Both work equally well for CPAN because CPAN has not yet had any
1315 problem with upload storms. On installations that have to deal with
1316 more uploaded data than inotify+rrr can handle it's better to use the
1317 traditional method such that the relevant processes can build up some
1318 backpressure to throttle writing processes until we're ready to accept
1319 the next data chunk.
1321 =head1 FUTURE DIRECTIONS
1323 Convince other users outside the CPAN like
1324 http://fedoraproject.org/wiki/Infrastructure/Mirroring
1326 =head1 SEE ALSO
1328 L<File::Rsync::Mirror::Recentfile>,
1329 L<File::Rsync::Mirror::Recentfile::Done>,
1330 L<File::Rsync::Mirror::Recentfile::FakeBigFloat>
1332 =head1 BUGS
1334 Please report any bugs or feature requests through the web interface
1336 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=File-Rsync-Mirror-Recent>.
1337 I will be notified, and then you'll automatically be notified of
1338 progress on your bug as I make changes.
1340 =head1 SUPPORT
1342 You can find documentation for this module with the perldoc command.
1344 perldoc File::Rsync::Mirror::Recent
1346 You can also look for information at:
1348 =over 4
1350 =item * RT: CPAN's request tracker
1352 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=File-Rsync-Mirror-Recent>
1354 =item * AnnoCPAN: Annotated CPAN documentation
1356 L<http://annocpan.org/dist/File-Rsync-Mirror-Recent>
1358 =item * CPAN Ratings
1360 L<http://cpanratings.perl.org/d/File-Rsync-Mirror-Recent>
1362 =item * Search CPAN
1364 L<http://search.cpan.org/dist/File-Rsync-Mirror-Recent>
1366 =back
1369 =head1 ACKNOWLEDGEMENTS
1371 Thanks to RJBS for module-starter.
1373 =head1 AUTHOR
1375 Andreas König
1377 =head1 COPYRIGHT & LICENSE
1379 Copyright 2008, 2009 Andreas König.
1381 This program is free software; you can redistribute it and/or modify it
1382 under the same terms as Perl itself.
1385 =cut
1387 1; # End of File::Rsync::Mirror::Recent