documentation improvements (site seeing tour, etc)
[rersyncrecent.git] / lib / File / Rsync / Mirror / Recent.pm
blob7b80486302a4c19410d2224d2e3dd9066c4ae153
1 package File::Rsync::Mirror::Recent;
3 # use warnings;
4 use strict;
5 use File::Rsync::Mirror::Recentfile;
7 =encoding utf-8
9 =head1 NAME
11 File::Rsync::Mirror::Recent - mirroring via rsync made efficient
13 =cut
15 package File::Rsync::Mirror::Recent;
17 use File::Basename qw(basename dirname fileparse);
18 use File::Copy qw(cp);
19 use File::Path qw(mkpath);
20 use File::Rsync;
21 use File::Rsync::Mirror::Recentfile::Done (); # at least needed by thaw()
22 use File::Rsync::Mirror::Recentfile::FakeBigFloat qw(:all);
23 use File::Temp;
24 use List::Pairwise qw(mapp grepp);
25 use List::Util qw(first max);
26 use Scalar::Util qw(blessed reftype);
27 use Storable;
28 use Time::HiRes qw();
29 use YAML::Syck;
31 use version; our $VERSION = qv('0.3.0');
33 =head1 SYNOPSIS
35 The documentation in here is normally not needed because the code is
36 considered to be run from several standalone programs. For a quick
37 overview, see the file README.mirrorcpan and the bin/ directory of the
38 distribution. For the architectural ideas see the section THE
39 ARCHITECTURE OF A COLLECTION OF RECENTFILES below.
41 File::Rsync::Mirror::Recent establishes a view on a collection of
42 File::Rsync::Mirror::Recentfile objects and provides abstractions
43 spanning multiple time intervals associated with those.
45 =head1 EXPORT
47 No exports.
49 =head1 CONSTRUCTORS
51 =head2 my $obj = CLASS->new(%hash)
53 Constructor. On every argument pair the key is a method name and the
54 value is an argument to that method name.
56 =cut
58 sub new {
59 my($class, @args) = @_;
60 my $self = bless {}, $class;
61 while (@args) {
62 my($method,$arg) = splice @args, 0, 2;
63 $self->$method($arg);
65 return $self;
68 =head2 my $obj = CLASS->thaw($statusfile)
70 Constructor from a statusfile left over from a previous
71 rmirror run. See also C<runstatusfile>.
73 =cut
75 sub _thaw_without_pathdb {
76 my($self,$file) = @_;
77 open my $fh, $file or die "Can't open '$file': $!";
78 local $/ = "\n";
79 my $in_pathdb = 0;
80 my $tfile = File::Temp->new
82 TEMPLATE => "Recent-thaw-XXXX",
83 TMPDIR => 1,
84 UNLINK => 0,
85 CLEANUP => 0,
86 SUFFIX => '.dat',
88 my $template_for_eop;
89 while (<$fh>) {
90 if ($in_pathdb) {
91 if (/$template_for_eop/) {
92 $in_pathdb = 0;
94 } elsif (/(\s+)-\s*__pathdb\s*:/) {
95 $in_pathdb = 1;
96 my $next_attr = sprintf "^%s\\S", " ?" x length($1);
97 $template_for_eop = qr{$next_attr};
99 print $tfile $_ unless $in_pathdb;
101 close $tfile or die "Could not close: $!";
102 my $return = $self->thaw($tfile->filename);
103 $return->_havelostpathdb(1);
104 unlink $tfile->filename;
105 return $return;
107 sub thaw {
108 my($self, $file) = @_;
109 die "thaw called without statusfile argument" unless defined $file;
110 unless (-e $file){
111 require Carp;
112 Carp::confess("Alert: statusfile '$file' not found");
114 require YAML::Syck;
115 my $start = time;
116 my $sleeptime = 0.02;
117 while (not mkdir "$file.lock") {
118 my $err = $!;
119 Time::HiRes::sleep $sleeptime;
120 my $waiting = time - $start;
121 if ($waiting >= 3){
122 warn "*** waiting ($waiting) for lock ($err) ***";
123 $sleeptime = 1;
126 my $size = -s $file;
127 my $serialized = YAML::Syck::LoadFile($file);
128 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
129 my $charged_self = $serialized->{reduced_self};
130 my $class = blessed $self;
131 bless $charged_self, $class;
132 my $rfs = $serialized->{reduced_rfs};
133 my $rfclass = $class . "file"; # "Recent" . "file"
134 my $pathdb = $charged_self->_pathdb;
135 for my $rf (@$rfs) {
136 bless $rf, $rfclass;
137 $rf->_pathdb($pathdb);
139 $charged_self->_recentfiles($rfs);
140 $charged_self->_principal_recentfile($rfs->[0]);
141 # die "FIXME: thaw all recentfiles from reduced_rfs into _recentfiles as well, watch out for pathdb and rsync";
142 return $charged_self;
145 =head1 ACCESSORS
147 =cut
149 my @accessors;
151 BEGIN {
152 @accessors =
154 "__pathdb",
155 "_dirtymark", # keeps track of the dirtymark of the recentfiles
156 "_havelostpathdb", # boolean
157 "_have_written_statusfile", # boolean
158 "_logfilefordone", # turns on _logfile on all DONE
159 # systems (disk intensive)
160 "_max_one_state", # when we have no time left but want
161 # at least get one file per
162 # iteration to avoid procrastination
163 "_principal_recentfile",
164 "_recentfiles",
165 "_rsync",
166 "_runstatusfile", # occasionally dumps all rfs
167 "_verbose", # internal variable for verbose setter/getter
168 "_verboselog", # internal variable for verboselog setter/getter
171 my @pod_lines =
172 split /\n/, <<'=cut'; push @accessors, grep {s/^=item\s+//} @pod_lines; }
174 =over 4
176 =item ignore_link_stat_errors
178 as in F:R:M:Recentfile
180 =item local
182 Option to specify the local principal file for operations with a local
183 collection of recentfiles.
185 =item localroot
187 as in F:R:M:Recentfile
189 =item max_files_per_connection
191 as in F:R:M:Recentfile
193 =item remote
195 The remote principal recentfile in rsync notation. E.g.
197 pause.perl.org::authors/RECENT.recent
199 =item remoteroot
201 as in F:R:M:Recentfile
203 =item remote_recentfile
205 Rsync address of the remote C<RECENT.recent> symlink or whichever name
206 the principal remote recentfile has.
208 =item rsync_options
210 Things like compress, links, times or checksums. Passed in to the
211 File::Rsync object used to run the mirror.
213 =item tempdir
215 as in F:R:M:Recentfile
217 =item ttl
219 Minimum time before fetching the principal recentfile again.
221 =back
223 =cut
225 use accessors @accessors;
227 =head1 METHODS
229 =head2 $arrayref = $obj->news ( %options )
231 Test this with:
233 perl -Ilib bin/rrr-news \
234 -after 1217200539 \
235 -max 12 \
236 -local /home/ftp/pub/PAUSE/authors/RECENT.recent
238 perl -Ilib bin/rrr-news \
239 -after 1217200539 \
240 -rsync=compress=1 \
241 -rsync=links=1 \
242 -localroot /home/ftp/pub/PAUSE/authors/ \
243 -remote pause.perl.org::authors/RECENT.recent
244 -verbose
246 All parameters that can be passed to
247 File:Rsync:Mirror:Recentfile::recent_events() can also be specified
248 here.
250 One additional option is supported. If C<$Options{callback}> is
251 specified, it must be a subref. This sub is called whenever one chunk
252 of events is found. The first argument to the callback is a reference
253 to the currently accumulated array of events.
255 Note: all data are kept in memory.
257 =cut
259 sub news {
260 my($self, %opt) = @_;
261 my $local = $self->local;
262 unless ($local) {
263 if (my $remote = $self->remote) {
264 my $localroot;
265 if ($localroot = $self->localroot) {
266 # nice, they know what they are doing
267 } else {
268 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
270 } else {
271 die "Alert: neither local nor remote specified, cannot continue";
274 my $rfs = $self->recentfiles;
275 my $ret = [];
276 my $before;
277 for my $rf (@$rfs) {
278 my %locopt = %opt;
279 $locopt{before} = $before;
280 if ($opt{max}) {
281 $locopt{max} -= scalar @$ret;
282 last if $locopt{max} <= 0;
284 $locopt{info} = {};
285 my $res = $rf->recent_events(%locopt);
286 if (@$res){
287 push @$ret, @$res;
289 if ($opt{max} && scalar @$ret > $opt{max}) {
290 last;
292 if ($opt{after}){
293 if ( $locopt{info}{last} && _bigfloatlt($locopt{info}{last}{epoch},$opt{after}) ) {
294 last;
296 if ( _bigfloatgt($opt{after},$locopt{info}{first}{epoch}) ) {
297 last;
300 if (!@$res){
301 next;
303 $before = $res->[-1]{epoch};
304 $before = $opt{before} if $opt{before} && _bigfloatlt($opt{before},$before);
305 if (my $sub = $opt{callback}) {
306 $sub->($ret);
309 $ret;
312 =head2 overview ( %options )
314 returns a small table that summarizes the state of all recentfiles
315 collected in this Recent object.
317 $options{verbose}=1 increases the number of columns displayed.
319 Here is an example output:
321 Ival Cnt Max Min Span Util Cloud
322 1h 47 1225053014.38 1225049650.91 3363.47 93.4% ^ ^
323 6h 324 1225052939.66 1225033394.84 19544.82 90.5% ^ ^
324 1d 437 1225049651.53 1224966402.53 83248.99 96.4% ^ ^
325 1W 1585 1225039015.75 1224435339.46 603676.29 99.8% ^ ^
326 1M 5855 1225017376.65 1222428503.57 2588873.08 99.9% ^ ^
327 1Q 17066 1224578930.40 1216803512.90 7775417.50 100.0% ^ ^
328 1Y 15901 1223966162.56 1216766820.67 7199341.89 22.8% ^ ^
329 Z 9909 1223966162.56 1216766820.67 7199341.89 - ^ ^
331 I<Max> is the name of the interval.
333 I<Cnt> is the number of entries in this recentfile.
335 I<Max> is the highest(first) epoch in this recentfile, rounded.
337 I<Min> is the lowest(last) epoch in this recentfile, rounded.
339 I<Span> is the timespan currently covered, rounded.
341 I<Util> is I<Span> devided by the designated timespan of this
342 recentfile.
344 I<Cloud> is ascii art illustrating the sequence of the Max and Min
345 timestamps.
347 =cut
348 sub overview {
349 my($self,%options) = @_;
350 my $rfs = $self->recentfiles;
351 my(@s,%rank);
352 RECENTFILE: for my $rf (@$rfs) {
353 my $re=$rf->recent_events;
354 my $rfsummary;
355 if (@$re) {
356 my $span = $re->[0]{epoch}-$re->[-1]{epoch};
357 my $merged = $rf->merged;
358 $rfsummary =
360 "Ival",
361 $rf->interval,
362 "Cnt",
363 scalar @$re,
364 "Dirtymark",
365 $rf->dirtymark ? sprintf("%.2f",$rf->dirtymark) : "-",
366 "Produced",
367 sprintf ("%.2f", $rf->{ORIG}{Producers}{time}||0),
368 "Merged",
369 ($rf->interval eq "Z"
373 sprintf ("%.2f", $merged->{epoch} || 0)),
374 "Max",
375 sprintf ("%.2f", $re->[0]{epoch}),
376 "Min",
377 sprintf ("%.2f", $re->[-1]{epoch}),
378 "Span",
379 sprintf ("%.2f", $span),
380 "Util", # u9n:)
381 ($rf->interval eq "Z"
385 sprintf ("%5.1f%%", 100 * $span / $rf->interval_secs)
388 @rank{mapp {$b} grepp {$a =~ /^(Max|Min)$/} @$rfsummary} = ();
389 } else {
390 next RECENTFILE;
392 push @s, $rfsummary;
394 @rank{sort {$b <=> $a} keys %rank} = 1..keys %rank;
395 my $maxrank = max values %rank;
396 for my $rfsummary (@s) {
397 my $string = " " x $maxrank;
398 my @borders;
399 for my $ele (qw(Max Min)) {
400 my($r) = mapp {$b} grepp {$a eq $ele} @$rfsummary;
401 push @borders, $rank{$r}-1;
403 for ($borders[0],$borders[1]) {
404 substr($string,$_,1) = "^";
406 push @$rfsummary, "Cloud", $string;
408 unless ($options{verbose}) {
409 my %filter = map {($_=>1)} qw(Ival Cnt Max Min Span Util Cloud);
410 for (@s) {
411 $_ = [mapp {($a,$b)} grepp {!!$filter{$a}} @$_];
414 my @sprintf;
415 for (my $i = 0; $i <= $#{$s[0]}; $i+=2) {
416 my $maxlength = max ((map { length $_->[$i+1] } @s), length $s[0][$i]);
417 push @sprintf, "%" . $maxlength . "s";
419 my $sprintf = join " ", @sprintf;
420 $sprintf .= "\n";
421 my $headline = sprintf $sprintf, mapp {$a} @{$s[0]};
422 join "", $headline, map { sprintf $sprintf, mapp {$b} @$_ } @s;
425 =head2 _pathdb
427 Keeping track of already handled files. Currently it is a hash, will
428 probably become a database with its own accessors.
430 =cut
432 sub _pathdb {
433 my($self, $set) = @_;
434 if ($set) {
435 $self->__pathdb ($set);
437 my $pathdb = $self->__pathdb;
438 unless (defined $pathdb) {
439 $self->__pathdb(+{});
441 return $self->__pathdb;
444 =head2 $recentfile = $obj->principal_recentfile ()
446 returns the principal recentfile object of this tree.
448 =cut
449 # mirrors the recentfile and instantiates the recentfile object
450 sub _principal_recentfile_fromremote {
451 my($self) = @_;
452 # get the remote recentfile
453 my $rrfile = $self->remote or die "Alert: cannot construct a recentfile object without the 'remote' attribute";
454 my $splitter = qr{(.+)/([^/]*)};
455 my($remoteroot,$rfilename) = $rrfile =~ $splitter;
456 $self->remoteroot($remoteroot);
457 my($abslfile, $fh);
458 if (!defined $rfilename) {
459 die "Alert: Cannot resolve '$rrfile', does not match $splitter";
460 } elsif (not length $rfilename or $rfilename eq "RECENT.recent") {
461 ($abslfile,$rfilename,$fh) = $self->_principal_recentfile_fromremote_resosymlink($rfilename);
463 my @need_args =
465 "ignore_link_stat_errors",
466 "localroot",
467 "max_files_per_connection",
468 "remoteroot",
469 "rsync_options",
470 "tempdir",
471 "ttl",
472 "verbose",
473 "verboselog",
475 my $rf0;
476 unless ($abslfile) {
477 $rf0 = File::Rsync::Mirror::Recentfile->new (map {($_ => $self->$_)} @need_args);
478 $rf0->split_rfilename($rfilename);
479 $abslfile = $rf0->get_remote_recentfile_as_tempfile ();
481 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ( $abslfile );
482 $rf0->_current_tempfile ( $abslfile );
483 $rf0->_current_tempfile_fh ( $fh );
484 $rf0->_use_tempfile (1);
485 for my $override (@need_args) {
486 $rf0->$override ( $self->$override );
488 $rf0->is_slave (1);
489 return $rf0;
491 sub principal_recentfile {
492 my($self) = @_;
493 my $rf0 = $self->_principal_recentfile;
494 return $rf0 if defined $rf0;
495 my $local = $self->local;
496 if ($local) {
497 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ($local);
498 } else {
499 if (my $remote = $self->remote) {
500 my $localroot;
501 if ($localroot = $self->localroot) {
502 # nice, they know what they are doing
503 } else {
504 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
506 $rf0 = $self->_principal_recentfile_fromremote;
507 } else {
508 die "Alert: neither local nor remote specified, cannot continue";
511 $self->_principal_recentfile($rf0);
512 return $rf0;
515 =head2 $recentfiles_arrayref = $obj->recentfiles ()
517 returns a reference to the complete list of recentfile objects that
518 describe this tree. No guarantee is given that the represented
519 recentfiles exist or have been read. They are just bare objects.
521 =cut
523 sub recentfiles {
524 my($self) = @_;
525 my $rfs = $self->_recentfiles;
526 return $rfs if defined $rfs;
527 my $rf0 = $self->principal_recentfile;
528 my $pathdb = $self->_pathdb;
529 $rf0->_pathdb ($pathdb);
530 my $aggregator = $rf0->aggregator;
531 my @rf = $rf0;
532 for my $agg (@$aggregator) {
533 my $nrf = $rf0->_sparse_clone;
534 $nrf->interval ( $agg );
535 $nrf->have_mirrored ( 0 );
536 $nrf->_pathdb ( $pathdb );
537 push @rf, $nrf;
539 $self->_recentfiles(\@rf);
540 return \@rf;
543 =head2 $success = $obj->rmirror ( %options )
545 Mirrors all recentfiles of the I<remote> address working through all
546 of them, mirroring their contents.
548 Test this with:
550 use File::Rsync::Mirror::Recent;
551 my $rrr = File::Rsync::Mirror::Recent->new(
552 ignore_link_stat_errors => 1,
553 localroot => "/home/ftp/pub/PAUSE/authors",
554 remote => "pause.perl.org::authors/RECENT.recent",
555 max_files_per_connection => 5000,
556 rsync_options => {
557 compress => 1,
558 links => 1,
559 times => 1,
560 checksum => 0,
562 verbose => 1,
563 _runstatusfile => "recent-rmirror-state.yml",
564 _logfilefordone => "recent-rmirror-donelog.log",
566 $rrr->rmirror ( "skip-deletes" => 1, loop => 1 );
568 Or try without the loop parameter and write the loop yourself:
570 use File::Rsync::Mirror::Recent;
571 my @rrr;
572 for my $t ("authors","modules"){
573 my $rrr = File::Rsync::Mirror::Recent->new(
574 ignore_link_stat_errors => 1,
575 localroot => "/home/ftp/pub/PAUSE/$t",
576 remote => "pause.perl.org::$t/RECENT.recent",
577 max_files_per_connection => 512,
578 rsync_options => {
579 compress => 1,
580 links => 1,
581 times => 1,
582 checksum => 0,
584 verbose => 1,
585 _runstatusfile => "recent-rmirror-state-$t.yml",
586 _logfilefordone => "recent-rmirror-donelog-$t.log",
587 ttl => 5,
589 push @rrr, $rrr;
591 while (){
592 for my $rrr (@rrr){
593 $rrr->rmirror ( "skip-deletes" => 1 );
595 warn "sleeping 23\n"; sleep 23;
599 =cut
600 # _alluptodate is unused but at least it worked last time I needed it,
601 # so let us keep it around
602 sub _alluptodate {
603 my($self) = @_;
604 my $sdm = $self->_dirtymark;
605 return unless defined $sdm;
606 for my $rf (@{$self->recentfiles}) {
607 return if $rf->seeded;
608 my $rfdm = $rf->dirtymark;
609 return unless defined $rfdm;
610 return unless $rfdm eq $sdm;
611 my $done = $rf->done;
612 return unless defined $done;
613 my $done_intervals = $done->_intervals;
614 return if !defined $done_intervals;
615 # nonono, may be more than one, only covered it must be:
616 # return if @$done_intervals > 1;
617 my $minmax = $rf->minmax;
618 return unless defined $minmax;
619 return unless $done->covered(@$minmax{qw(max min)});
621 # $DB::single++;
622 return 1;
624 sub _fullseed {
625 my($self) = @_;
626 for ( @{$self->recentfiles} ) { $_->seed(1) }
628 sub rmirror {
629 my($self, %options) = @_;
631 my $rfs = $self->recentfiles;
633 $self->principal_recentfile->seed;
634 my $_sigint = sub {
635 # XXX exit gracefully (reminder)
638 # XXX needs accessor: warning, if set too low, we do nothing but
639 # mirror the principal!
640 my $minimum_time_per_loop = 20;
642 if (my $logfile = $self->_logfilefordone) {
643 for my $i (0..$#$rfs) {
644 $rfs->[$i]->done->_logfile($logfile);
647 if (my $dirtymark = $self->principal_recentfile->dirtymark) {
648 my $mydm = $self->_dirtymark;
649 if (!defined $mydm){
650 $self->_dirtymark($dirtymark);
651 } elsif ($dirtymark ne $mydm) {
652 if ($self->verbose) {
653 my $fh;
654 if (my $vl = $self->verboselog) {
655 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
656 } else {
657 $fh = \*STDERR;
659 print $fh "NewDirtymark: old[$mydm] new[$dirtymark]\n";
661 $self->_dirtymark($dirtymark);
664 my $rstfile = $self->runstatusfile;
665 unless ($self->_have_written_statusfile) {
666 $self->_rmirror_runstatusfile_write ($rstfile, \%options);
667 $self->_have_written_statusfile(1);
669 $self->_rmirror_loop($minimum_time_per_loop,\%options);
672 sub _rmirror_loop {
673 my($self,$minimum_time_per_loop,$options) = @_;
674 LOOP: while () {
675 my $ttleave = time + $minimum_time_per_loop;
676 my $rstfile = $self->runstatusfile;
677 my $otherproc = $self->_thaw_without_pathdb ($rstfile);
678 my $pid = fork;
679 if (! defined $pid) {
680 warn "Contention: $!";
681 sleep 0.25;
682 next LOOP;
683 } elsif ($pid) {
684 waitpid($pid,0);
685 } else {
686 $self = $self->thaw ($rstfile);
687 my $rfs = $self->recentfiles;
688 $self->principal_recentfile->seed;
689 RECENTFILE: for my $i (0..$#$rfs) {
690 my $rf = $rfs->[$i];
691 if (time > $ttleave) {
692 # Must make sure that one file can get fetched in any case
693 $self->_max_one_state(1);
695 if ($rf->seeded) {
696 $self->_rmirror_mirror ($i, $options);
697 } elsif ($rf->uptodate) {
698 if ($i < $#$rfs) {
699 $rfs->[$i+1]->done->merge($rf->done);
701 # no further seed necessary because "periodic" does it
702 next RECENTFILE;
704 WORKUNIT: while (time < $ttleave) {
705 if ($rf->uptodate) {
706 $self->_rmirror_sleep_per_connection ($i);
707 next RECENTFILE;
708 } else {
709 $self->_rmirror_mirror ($i, $options);
712 if ($self->_max_one_state) {
713 last RECENTFILE;
716 $self->_max_one_state(0);
717 my $exit = 0;
718 if ($rfs->[-1]->uptodate) {
719 $self->_rmirror_cleanup;
721 unless ($options->{loop}) {
722 $exit = 1;
724 $self->_rmirror_runstatusfile_write ($rstfile, $options);
725 exit if $exit;
726 last LOOP;
729 $otherproc = $self->_thaw_without_pathdb ($rstfile);
730 if (!$options->{loop} && $otherproc && $otherproc->recentfiles->[-1]->uptodate) {
731 last LOOP;
733 my $sleep = $ttleave - time;
734 if ($sleep > 0.01) {
735 $self->_rmirror_endofloop_sleep ($sleep);
736 } else {
737 # negative time not invented yet:)
742 sub _rmirror_mirror {
743 my($self, $i, $options) = @_;
744 my $rfs = $self->recentfiles;
745 my $rf = $rfs->[$i];
746 my %locopt = %$options;
747 if ($self->_max_one_state) {
748 $locopt{max} = 1;
750 $locopt{piecemeal} = 1;
751 $rf->mirror (%locopt);
752 if ($i==0) {
753 # we limit to 0 for the case that upstream is broken and has
754 # more than one timestamp (happened on PAUSE 200903)
755 if (my $dirtymark = $rf->dirtymark) {
756 my $mydm = $self->_dirtymark;
757 if (!defined $mydm or $dirtymark ne $mydm) {
758 $self->_dirtymark($dirtymark);
759 $self->_fullseed;
765 sub _rmirror_sleep_per_connection {
766 my($self, $i) = @_;
767 my $rfs = $self->recentfiles;
768 my $rf = $rfs->[$i];
769 my $sleep = $rf->sleep_per_connection;
770 $sleep = 0.42 unless defined $sleep;
771 Time::HiRes::sleep $sleep;
772 $rfs->[$i+1]->done->merge($rf->done) if $i < $#$rfs;
775 sub _rmirror_cleanup {
776 my($self) = @_;
777 my $pathdb = $self->_pathdb();
778 for my $k (keys %$pathdb) {
779 delete $pathdb->{$k};
781 my $rfs = $self->recentfiles;
782 for my $i (0..$#$rfs-1) {
783 my $thismerged = $rfs->[$i]->merged;
784 my $next = $rfs->[$i+1];
785 my $nextminmax = $next->minmax;
786 if (not defined $thismerged->{epoch} or _bigfloatlt($nextminmax->{max},$thismerged->{epoch})){
787 $next->seed;
792 =head2 $file = $obj->runstatusfile ($set)
794 Getter/setter for C<_runstatusfile> attribute. Defaults to a temporary
795 file created by C<File::Temp>. A status file is required for
796 C<rmirror> working. Since it may be interesting for debugging
797 purposes, you may want to specify a permanent file for this.
799 =cut
800 sub runstatusfile {
801 my($self,$set) = @_;
802 if (defined $set) {
803 $self->_runstatusfile ($set);
805 my $x = $self->_runstatusfile;
806 unless (defined $x) {
807 require File::Temp;
808 my $tfile = File::Temp->new
810 TEMPLATE => "Recent-XXXX",
811 TMPDIR => 1,
812 UNLINK => 0,
813 CLEANUP => 0,
814 SUFFIX => '.dat',
816 $self->_runstatusfile($tfile->filename);
818 return $self->_runstatusfile;
821 # unused code.... it was an oops, discovered the thaw() method too
822 # late, and starting writing this here....
823 sub _rmirror_runstatusfile_read {
824 my($self, $file) = @_;
826 require YAML::Syck;
827 my $start = time;
828 # XXX is locking useful here?
829 while (not mkdir "$file.lock") {
830 Time::HiRes::sleep 0.2;
831 warn "*** waiting for lock ***" if time - $start >= 3;
833 my $yml = YAML::Syck::LoadFile $file;
834 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
835 my $rself = $yml->{reduced_self};
836 my $rfs = $yml->{reduced_rfs};
837 # XXX bring them into self
840 sub _rmirror_runstatusfile_write {
841 my($self, $file, $options) = @_;
842 my $rself;
843 while (my($k,$v) = each %$self) {
844 next if $k =~ /^-(_principal_recentfile|_recentfiles)$/;
845 $rself->{$k} = $v;
847 my $rfs = $self->recentfiles;
848 my $rrfs;
849 for my $i (0..$#$rfs) {
850 my $rf = $rfs->[$i];
851 while (my($k,$v) = each %$rf) {
852 next if $k =~ /^-(_current_tempfile_fh|_pathdb|_rsync)$/;
853 $rrfs->[$i]{$k} = $rfs->[$i]{$k};
856 require YAML::Syck;
857 my $start = time;
858 while (not mkdir "$file.lock") {
859 Time::HiRes::sleep 0.15;
860 warn "*** waiting for lock directory '$file.lock' ***" if time - $start >= 3;
862 YAML::Syck::DumpFile
864 "$file.new",
866 options => $options,
867 time => time,
868 reduced_rfs => $rrfs,
869 reduced_self => $rself,
871 rename "$file.new", $file or die "Could not rename: $!";
872 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
875 sub _rmirror_endofloop_sleep {
876 my($self, $sleep) = @_;
877 if ($self->verbose) {
878 my $fh;
879 if (my $vl = $self->verboselog) {
880 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
881 } else {
882 $fh = \*STDERR;
884 printf $fh
886 "Dorm %d (%s secs)\n",
887 time,
888 $sleep,
891 sleep $sleep;
894 # it returns two things: abslfile and rfilename. But the abslfile is
895 # undef when the rfilename ends in .recent. A weird interface, my
896 # friend.
897 sub _principal_recentfile_fromremote_resosymlink {
898 my($self, $rfilename) = @_;
899 $rfilename = "RECENT.recent" unless length $rfilename;
900 my $abslfile = undef;
901 my $fh;
902 if ($rfilename =~ /\.recent$/) {
903 # may be a file *or* a symlink,
904 ($abslfile,$fh) = $self->_fetch_as_tempfile ($rfilename);
905 while (-l $abslfile) {
906 my $symlink = readlink $abslfile;
907 if ($symlink =~ m|/|) {
908 die "FIXME: filenames containing '/' not supported, got '$symlink'";
910 my $localrfile = File::Spec->catfile($self->localroot, $rfilename);
911 if (-e $localrfile) {
912 my $old_symlink = readlink $localrfile;
913 if ($old_symlink eq $symlink) {
914 unlink $abslfile or die "Cannot unlink '$abslfile': $!";
915 } else {
916 unlink $localrfile; # may fail
917 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
919 } else {
920 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
922 ($abslfile,$fh) = $self->_fetch_as_tempfile ($symlink);
925 return ($abslfile, $rfilename, $fh);
928 # takes a basename, returns an absolute name, does not delete the
929 # file, throws the $fh away. Caller must rename or unlink
931 # XXX needs to activate the fh in the rf0 so that it is able to unlink
932 # the file. I would like that the file is used immediately by $rf0
933 sub _fetch_as_tempfile {
934 my($self, $rfile) = @_;
935 my($suffix) = $rfile =~ /(\.[^\.]+)$/;
936 $suffix = "" unless defined $suffix;
937 my $fh = File::Temp->new
938 (TEMPLATE => sprintf(".FRMRecent-%s-XXXX",
939 $rfile,
941 DIR => $self->tempdir || $self->localroot,
942 SUFFIX => $suffix,
943 UNLINK => 0,
945 my $rsync;
946 unless ($rsync = File::Rsync->new($self->rsync_options)) {
947 require Carp;
948 Carp::confess(YAML::Syck::Dump($self->rsync_options));
950 my $dst = $fh->filename;
951 local($ENV{LANG}) = "C";
952 $rsync->exec
954 src => join("/",$self->remoteroot,$rfile),
955 dst => $dst,
956 ) or die "Could not mirror '$rfile' to $fh\: ".join(" ",$rsync->err);
957 unless (-l $dst) {
958 my $mode = 0644;
959 chmod $mode, $dst or die "Could not chmod $mode '$dst': $!";
961 return($dst,$fh);
964 =head2 $verbose = $obj->verbose ( $set )
966 Getter/setter method to set verbosity for this F:R:M:Recent object and
967 all associated Recentfile objects.
969 =cut
970 sub verbose {
971 my($self,$set) = @_;
972 if (defined $set) {
973 for ( @{$self->recentfiles} ) { $_->verbose($set) }
974 $self->_verbose ($set);
976 my $x = $self->_verbose;
977 unless (defined $x) {
978 $x = 0;
979 $self->_verbose ($x);
981 return $x;
985 =head2 my $vl = $obj->verboselog ( $set )
987 Getter/setter method for the path to the logfile to write verbose
988 progress information to.
990 Note: This is a primitive stop gap solution to get simple verbose
991 logging working. The program still sends error messages to STDERR.
992 Switching to Log4perl or similar is probably the way to go. TBD.
994 =cut
995 sub verboselog {
996 my($self,$set) = @_;
997 if (defined $set) {
998 for ( @{$self->recentfiles} ) { $_->verboselog($set) }
999 $self->_verboselog ($set);
1001 my $x = $self->_verboselog;
1002 unless (defined $x) {
1003 $x = 0;
1004 $self->_verboselog ($x);
1006 return $x;
1009 =head1 THE ARCHITECTURE OF A COLLECTION OF RECENTFILES
1011 The idea is that we want to have a short file that records really
1012 recent changes. So that a fresh mirror can be kept fresh as long as
1013 the connectivity is given. Then we want longer files that record the
1014 history before. So when the mirror falls behind the update period
1015 reflected in the shortest file, it can complement the list of recent
1016 file events with the next one. And if this is not long enough we want
1017 another one, again a bit longer. And we want one that completes the
1018 history back to the oldest file. The index files together do contain
1019 the complete list of current files. The longer a period covered by an
1020 index file is gone the less often the index file is updated. For
1021 practical reasons adjacent files will often overlap a bit but this is
1022 neither necessary nor enforced. Enforced is only that there must not
1023 ever be a gap between two adjacent index files. That's the basic idea.
1024 The following example represents a tree that has a few updates every
1025 day:
1027 RECENT.recent -> RECENT-1h.yaml
1028 RECENT-1h.yaml
1029 RECENT-6h.yaml
1030 RECENT-1d.yaml
1031 RECENT-1M.yaml
1032 RECENT-1W.yaml
1033 RECENT-1Q.yaml
1034 RECENT-1Y.yaml
1035 RECENT-Z.yaml
1037 Each of these files represents a contract to hold a record for every
1038 filesystem event within the period indicated in the filename.
1040 The first file is the principal file, in so far it is the one that is
1041 written first after a filesystem change. Usually a symlink links to it
1042 with a filename that has the same filenameroot and the suffix
1043 C<.recent>. On systems that do not support symlinks there is a plain
1044 copy maintained instead.
1046 The last file, the Z file, contains the complementary files that are
1047 in none of the other files. It may contain C<delete> events but often
1048 C<delete> events are discarded at the transition to the Z file.
1050 =head2 SITE SEEING TOUR
1052 This section illustrates the operation of a server-client couple in a
1053 fictious installation that has to deal with a long time of inactivity.
1054 I think such an edge case installation demonstrates the economic
1055 behaviour of our model of overlapping time slices best.
1057 The sleeping beauty (http://en.wikipedia.org/wiki/Sleeping_Beauty) is
1058 a classic fairytale of a princess sleeping for a hundred years. The
1059 story inspired the test case 02-aurora.t.
1061 Given an upstream server where the people stop feeding new files for
1062 one hundred years. That upstream server has no driving energy to do
1063 major changes to its RECENT files. Cronjobs will continue to shift
1064 things towards the Z file but soon will stop doing so since all of
1065 them have to keep their promise to record files covering a certain
1066 period. Soon all RECENT files will cover exactly their native period.
1068 Downstream servers will stubbornly ask their question to the rsync
1069 server whether there is a newer RECENT.recent. As soon as the smallest
1070 RECENT file has reached the state of maximum possible merge with the
1071 second smallest RECENT file, the answer of the rsync server will
1072 always be: nothing new. And downstream servers that were uptodate on
1073 the previous request will be satisfied and do nothing. Never will they
1074 request a download. The answer that there is no change is sufficient
1075 to determine that there is no change in the whole tree.
1077 Let's presume the smallest RECENT file on this castle is a 1h file and
1078 downstream decides to ask every 30 minutes. Now the hundred years are
1079 over and upstream starts producing files again. One file every minute.
1080 After one minute it will move old files over to the, say, 1d file. In
1081 the next sixty minutes it will not be allowed to move any other file
1082 over to the 1d file. At some point in time downstream will ask the
1083 obligatory question "anything new?" and it will get the current 1h
1084 file. It will recognize in the meta part of the current file which
1085 timestamps have been moved to the 1d file, it will recognize that it
1086 has all those. It will have no need to download the 1d file, it will
1087 download the missing files and be done. No second RECENT file needs to
1088 be downloaded.
1090 Downstream only decides to download another RECENT file when not doing
1091 so would result in a gap between two recent files. Such that
1092 consistency checks would become impossible. Or for potentially
1093 interested third parties, like down-down-stream servers.
1095 Downloads of RECENT files are subject to rsync optimizations in that
1096 rsync does some level of blockwise checksumming that is considered
1097 efficient to avoid copying blocks of data that have not changed. Our
1098 format is that of an ordered array, so that large blocks stay constant
1099 when elements are prepended to the array. This means we usually do not
1100 have to rsync full RECENT files. Only if they are really small, the
1101 rsync algorithm will not come into play but that's OK for small files.
1103 Upstream servers are extremely lazy in writing the larger files. See
1104 File::Rsync::Mirror::Recentfile::aggregate() for the specs. Long
1105 before the one hundred years are over, the upstream server will stop
1106 changing files. Slowly everything that existed before upstream fell
1107 asleep trickles into the Z file. Say, the second-largest RECENT file
1108 is a 1Y file and the third-largest RECENT file is a 1Q file, then it
1109 will take at least one quarter of a year that the 1Y file will be
1110 merged into the Z file. From that point in time everything will have
1111 been merged into the Z file and the server's job to call C<aggregate>
1112 regularly will become a noop. Consequently downstream will never again
1113 download anything. Just the obligatory question: anything new?
1115 =head2 THE INDIVIDUAL RECENTFILE
1117 A I<recentfile> consists of a hash that has two keys: C<meta> and
1118 C<recent>. The C<meta> part has metadata and the C<recent> part has a
1119 list of fileobjects.
1121 =head2 THE META PART
1123 Here we find things that are pretty much self explaining: all
1124 lowercase attributes are accessors and as such explained in the
1125 manpages. The uppercase attribute C<Producers> contains version
1126 information about involved software components.
1128 Even though the lowercase attributes are documented in the
1129 F:R:M:Recentfile manpage, let's focus on the important stuff to make
1130 sure nothing goes by unnoticed: meta contains the aggregator levels in
1131 use in this installation, in other words the names of the RECENT
1132 files, eg:
1134 aggregator:
1135 - 3s
1136 - 8s
1137 - 21s
1138 - 55s
1141 It contains a dirtymark telling us the timestamp of the last protocol
1142 violation of the upstream server:
1144 dirtymark: '1325093856.49272'
1146 Plus a few things convenient in a situation where we need to do some
1147 debugging.
1149 And it contains information about which timestamp is the maximum
1150 timestamp in the neighboring file. This is probably the most important
1151 data in meta:
1153 merged:
1154 epoch: 1307159461.94575
1156 This keeps track of the highest epoch we would find if we looked into
1157 the next RECENT file.
1159 Another entry is the minmax, eg:
1161 minmax:
1162 max: 1307161441.97444
1163 min: 1307140103.70322
1165 The merged/epoch and minmax examples above illustrate one case of an
1166 overlap (130715... is between 130716... and 130714...). The syncing
1167 strategy for the client is in general the imperative: if the interval
1168 covered by a recentfile (minmax) and the interval covered by the next
1169 higher recentfile (merged/epoch) do not overlap anymore, then it is
1170 time to refresh the next recentfile.
1172 =head2 THE RECENT PART
1174 This is the interesting part. Every entry refers to some filesystem
1175 change (with path, epoch, type).
1177 The I<epoch> value is the point in time when some change was
1178 I<registered> but can be set to arbitrary values. Do not be tempted to
1179 believe that the entry has a direct relation to something like
1180 modification time or change time on the filesystem level. They are not
1181 reflecting release dates. (If you want exact release dates: Barbie is
1182 providing a database of them. See
1183 http://use.perl.org/~barbie/journal/37907).
1185 All these entries can be devided into two types (denoted by the
1186 I<type> attribute): C<new>s and C<delete>s. Changes and creations are
1187 C<new>s. Deletes are C<delete>s.
1189 Besides an I<epoch> and a I<type> attribute we find a third one:
1190 I<path>. This path is relative to the directory we find the
1191 I<recentfile> in.
1193 The order of the entries in the I<recentfile> is by decreasing epoch
1194 attribute. These are unique floating point numbers. When the server
1195 has ntp running correctly, then the timestamps are usually reflecting
1196 a real epoch. If time is running backwards, we trump the system epoch
1197 with strictly monotonically increasing floating point timestamps and
1198 guarantee they are unique.
1200 =head1 CORRUPTION AND RECOVERY
1202 If the origin host breaks the promise to deliver consistent and
1203 complete I<recentfiles> then it must update its C<dirtymark> and all
1204 slaves must discard what they cosider the truth.
1206 In the worst case that something goes wrong despite the dirtymark
1207 mechanism the way back to sanity can be achieved through traditional
1208 rsyncing between the hosts. But please be wary doing that: mixing
1209 traditional rsync and the F:R:M:R technique can lead to gratuitous
1210 extra errors. If you're the last host in a chain, there's nobody you
1211 can disturb, but if you have downstream clients, it is possible that
1212 rsync copies a RECENT file before the contained files are actually
1213 available.
1215 =head1 BACKGROUND
1217 This is about speeding up rsync operation on large trees. Uses a small
1218 metadata cocktail and pull technology.
1220 rersyncrecent solves this problem with a couple of (usually 2-10)
1221 lightweight index files which cover different overlapping time
1222 intervals. The master writes these files and the clients/slaves can
1223 construct the full tree from the information contained in them. The
1224 most recent index file usually covers the last seconds or minutes or
1225 hours of the tree and depending on the needs, slaves can rsync every
1226 few seconds or minutes and then bring their trees in full sync.
1228 The rersyncrecent model was developed for CPAN but as it is both
1229 convenient and economic it is also a general purpose solution. I'm
1230 looking forward to see a CPAN backbone that is only a few seconds
1231 behind PAUSE.
1233 =head2 NON-COMPETITORS
1235 File::Mirror JWU/File-Mirror/File-Mirror-0.10.tar.gz only local trees
1236 Mirror::YAML ADAMK/Mirror-YAML-0.03.tar.gz some sort of inner circle
1237 Net::DownloadMirror KNORR/Net-DownloadMirror-0.04.tar.gz FTP sites and stuff
1238 Net::MirrorDir KNORR/Net-MirrorDir-0.05.tar.gz dito
1239 Net::UploadMirror KNORR/Net-UploadMirror-0.06.tar.gz dito
1240 Pushmi::Mirror CLKAO/Pushmi-v1.0.0.tar.gz something SVK
1242 rsnapshot www.rsnapshot.org focus on backup
1243 csync www.csync.org more like unison
1244 multi-rsync sourceforge 167893 lan push to many
1245 chasm chasmd.org per-directory manifests
1247 =head2 COMPETITORS
1249 The problem to solve which clusters and ftp mirrors and otherwise
1250 replicated datasets like CPAN share: how to transfer only a minimum
1251 amount of data to determine the diff between two hosts.
1253 Normally it takes a long time to determine the diff itself before it
1254 can be transferred. Known solutions at the time of this writing are
1255 csync2, and rsync 3 batch mode.
1257 For many years the best solution was B<csync2> which solves the
1258 problem by maintaining a sqlite database on both ends and talking a
1259 highly sophisticated protocol to quickly determine which files to send
1260 and which to delete at any given point in time. Csync2 is often
1261 inconvenient because it is push technology and the act of syncing
1262 demands quite an intimate relationship between the sender and the
1263 receiver. This is hard to achieve in an environment of loosely coupled
1264 sites where the number of sites is large or connections are unreliable
1265 or network topology is changing.
1267 B<Rsync 3 batch mode> works around these problems by providing
1268 rsync-able batch files which allow receiving nodes to replay the
1269 history of the other nodes. This reduces the need to have an
1270 incestuous relation but it has the disadvantage that these batch files
1271 replicate the contents of the involved files. This seems inappropriate
1272 when the nodes already have a means of communicating over rsync.
1274 =head2 HONORABLE MENTION
1276 B<instantmirror> at https://fedorahosted.org/InstantMirror/ is an
1277 ambitious project that tries to combine various technologies (squid,
1278 bittorrent) to overcome the current slowness with the main focus on
1279 fedora. It's been founded in 2009-03 and at the time of this writing
1280 it is still a bit early to comment on.
1282 =head1 LIMITATIONS
1284 If the tree of the master server is changing faster than the bandwidth
1285 permits to mirror then additional protocols may need to be deployed.
1286 Certainly p2p/bittorrent can help in such situations because
1287 downloading sites help each other and bittorrent chunks large files
1288 into pieces.
1290 =head1 INOTIFY
1292 Currently the origin server has two options. The traditional one is to
1293 strictly keep track of injected and removed files through all involved
1294 processes and call C<update> on every file system event. The other
1295 option is to let data come in and use the assistance of inotify. PAUSE
1296 is running the former, the cpan master site is running the latter.
1297 Both work equally well for CPAN because CPAN has not yet had any
1298 problem with upload storms. On installations that have to deal with
1299 more uploaded data than inotify+rrr can handle it's better to use the
1300 traditional method such that the relevant processes can build up some
1301 backpressure to throttle writing processes until we're ready to accept
1302 the next data chunk.
1304 =head1 FUTURE DIRECTIONS
1306 Convince other users outside the CPAN like
1307 http://fedoraproject.org/wiki/Infrastructure/Mirroring
1309 =head1 SEE ALSO
1311 L<File::Rsync::Mirror::Recentfile>,
1312 L<File::Rsync::Mirror::Recentfile::Done>,
1313 L<File::Rsync::Mirror::Recentfile::FakeBigFloat>
1315 =head1 BUGS
1317 Please report any bugs or feature requests through the web interface
1319 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=File-Rsync-Mirror-Recent>.
1320 I will be notified, and then you'll automatically be notified of
1321 progress on your bug as I make changes.
1323 =head1 SUPPORT
1325 You can find documentation for this module with the perldoc command.
1327 perldoc File::Rsync::Mirror::Recent
1329 You can also look for information at:
1331 =over 4
1333 =item * RT: CPAN's request tracker
1335 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=File-Rsync-Mirror-Recent>
1337 =item * AnnoCPAN: Annotated CPAN documentation
1339 L<http://annocpan.org/dist/File-Rsync-Mirror-Recent>
1341 =item * CPAN Ratings
1343 L<http://cpanratings.perl.org/d/File-Rsync-Mirror-Recent>
1345 =item * Search CPAN
1347 L<http://search.cpan.org/dist/File-Rsync-Mirror-Recent>
1349 =back
1352 =head1 ACKNOWLEDGEMENTS
1354 Thanks to RJBS for module-starter.
1356 =head1 AUTHOR
1358 Andreas König
1360 =head1 COPYRIGHT & LICENSE
1362 Copyright 2008, 2009 Andreas König.
1364 This program is free software; you can redistribute it and/or modify it
1365 under the same terms as Perl itself.
1368 =cut
1370 1; # End of File::Rsync::Mirror::Recent