this version ran a full download cycle; bug being that state file does not shring...
[rersyncrecent.git] / lib / File / Rsync / Mirror / Recent.pm
blob3dc2763f7d871bbf7fcabfb9351d1c565b006b59
1 package File::Rsync::Mirror::Recent;
3 # use warnings;
4 use strict;
5 use File::Rsync::Mirror::Recentfile;
7 =encoding utf-8
9 =head1 NAME
11 File::Rsync::Mirror::Recent - mirroring via rsync made efficient
13 =cut
15 package File::Rsync::Mirror::Recent;
17 use File::Basename qw(basename dirname fileparse);
18 use File::Copy qw(cp);
19 use File::Path qw(mkpath);
20 use File::Rsync;
21 use File::Rsync::Mirror::Recentfile::Done (); # at least needed by thaw()
22 use File::Rsync::Mirror::Recentfile::FakeBigFloat qw(:all);
23 use File::Temp;
24 use List::Pairwise qw(mapp grepp);
25 use List::Util qw(first max);
26 use Scalar::Util qw(blessed reftype);
27 use Storable;
28 use Time::HiRes qw();
29 use YAML::Syck;
31 use version; our $VERSION = qv('0.0.8');
33 =head1 SYNOPSIS
35 B<!!!! PRE-ALPHA ALERT !!!!>
37 Nothing in here is believed to be stable, nothing yet intended for
38 public consumption. The plan is to provide scripts that act as
39 frontends for all the backend functionality. Option and method names
40 may still change.
42 For the rationale see the section BACKGROUND.
44 The documentation in here is normally not needed because the code is
45 meant to be run from several standalone programs. For a quick
46 overview, see the file README.mirrorcpan and the bin/ directory of the
47 distribution. For the architectural ideas see the section THE
48 ARCHITECTURE OF A COLLECTION OF RECENTFILES below.
50 File::Rsync::Mirror::Recent establishes a view on a collection of
51 File::Rsync::Mirror::Recentfile objects and provides abstractions
52 spanning multiple time intervals associated with those.
54 =head1 EXPORT
56 No exports.
58 =head1 CONSTRUCTORS
60 =head2 my $obj = CLASS->new(%hash)
62 Constructor. On every argument pair the key is a method name and the
63 value is an argument to that method name.
65 =cut
67 sub new {
68 my($class, @args) = @_;
69 my $self = bless {}, $class;
70 while (@args) {
71 my($method,$arg) = splice @args, 0, 2;
72 $self->$method($arg);
74 return $self;
77 =head2 my $obj = CLASS->thaw($statusfile)
79 Constructor from a statusfile left over from a previous
80 rmirror run. See also C<runstatusfile>.
82 =cut
84 sub _thaw_if_small_enough {
85 my($self,$file) = @_;
86 return if -s $file > 100_000; # XXX should read and look how
87 # many lines we have for
88 # "reduced_self.-__pathdb"?
89 return $self->thaw($file);
91 sub thaw {
92 my($self, $file) = @_;
93 die "thaw called without statusfile argument" unless defined $file;
94 die "Alert: statusfile '$file' not found" unless -e $file;
95 require YAML::Syck;
96 my $start = time;
97 my $sleeptime = 0.02;
98 while (not mkdir "$file.lock") {
99 my $err = $!;
100 Time::HiRes::sleep $sleeptime;
101 my $waiting = time - $start;
102 if ($waiting >= 3){
103 warn "*** waiting ($waiting) for lock ($err) ***";
104 $sleeptime = 1;
107 my $size = -s $file;
108 my $serialized = YAML::Syck::LoadFile($file);
109 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
110 warn sprintf "DEBUG: Process $$ reading '$file' (size=$size). It was written %d seconds ago", time-$serialized->{time};
111 my $charged_self = $serialized->{reduced_self};
112 my $class = blessed $self;
113 bless $charged_self, $class;
114 my $rfs = $serialized->{reduced_rfs};
115 my $rfclass = $class . "file"; # "Recent" . "file"
116 my $pathdb = $charged_self->_pathdb;
117 for my $rf (@$rfs) {
118 bless $rf, $rfclass;
119 $rf->_pathdb($pathdb);
121 $charged_self->_recentfiles($rfs);
122 $charged_self->_principal_recentfile($rfs->[0]);
123 # die "FIXME: thaw all recentfiles from reduced_rfs into _recentfiles as well, watch out for pathdb and rsync";
124 return $charged_self;
127 =head1 ACCESSORS
129 =cut
131 my @accessors;
133 BEGIN {
134 @accessors =
136 "__pathdb",
137 "_dirtymark", # keeps track of the dirtymark of the recentfiles
138 "_logfilefordone", # turns on _logfile on all DONE
139 # systems (disk intensive)
140 "_max_one_state", # when we have no time left but want
141 # at least get one file per
142 # iteration to avoid procrastination
143 "_principal_recentfile",
144 "_recentfiles",
145 "_rsync",
146 "_runstatusfile", # occasionally dumps all rfs
147 "_verbose", # internal variable for verbose setter/getter
148 "_verboselog", # internal variable for verboselog setter/getter
151 my @pod_lines =
152 split /\n/, <<'=cut'; push @accessors, grep {s/^=item\s+//} @pod_lines; }
154 =over 4
156 =item ignore_link_stat_errors
158 as in F:R:M:Recentfile
160 =item local
162 Option to specify the local principal file for operations with a local
163 collection of recentfiles.
165 =item localroot
167 as in F:R:M:Recentfile
169 =item max_files_per_connection
171 as in F:R:M:Recentfile
173 =item remote
175 The remote principal recentfile in rsync notation. E.g.
177 pause.perl.org::authors/RECENT.recent
179 =item remoteroot
181 as in F:R:M:Recentfile
183 =item remote_recentfile
185 Rsync address of the remote C<RECENT.recent> symlink or whichever name
186 the principal remote recentfile has.
188 =item rsync_options
190 Things like compress, links, times or checksums. Passed in to the
191 File::Rsync object used to run the mirror.
193 =item tempdir
195 as in F:R:M:Recentfile
197 =item ttl
199 Minimum time before fetching the principal recentfile again.
201 =back
203 =cut
205 use accessors @accessors;
207 =head1 METHODS
209 =head2 $arrayref = $obj->news ( %options )
211 Test this with:
213 perl -Ilib bin/rrr-news \
214 -after 1217200539 \
215 -max 12 \
216 -local /home/ftp/pub/PAUSE/authors/RECENT.recent
218 perl -Ilib bin/rrr-news \
219 -after 1217200539 \
220 -rsync=compress=1 \
221 -rsync=links=1 \
222 -localroot /home/ftp/pub/PAUSE/authors/ \
223 -remote pause.perl.org::authors/RECENT.recent
224 -verbose
226 Note: all parameters that can be passed to recent_events can also be specified here.
228 Note: all data are kept in memory
230 =cut
232 sub news {
233 my($self, %opt) = @_;
234 my $local = $self->local;
235 unless ($local) {
236 if (my $remote = $self->remote) {
237 my $localroot;
238 if ($localroot = $self->localroot) {
239 # nice, they know what they are doing
240 } else {
241 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
243 } else {
244 die "Alert: neither local nor remote specified, cannot continue";
247 my $rfs = $self->recentfiles;
248 my $ret = [];
249 my $before;
250 for my $rf (@$rfs) {
251 my %locopt = %opt;
252 $locopt{before} = $before;
253 if ($opt{max}) {
254 $locopt{max} -= scalar @$ret;
255 last if $locopt{max} <= 0;
257 $locopt{info} = {};
258 my $res = $rf->recent_events(%locopt);
259 if (@$res){
260 push @$ret, @$res;
262 if ($opt{max} && scalar @$ret > $opt{max}) {
263 last;
265 if ($opt{after}){
266 if ( $locopt{info}{last} && _bigfloatlt($locopt{info}{last}{epoch},$opt{after}) ) {
267 last;
269 if ( _bigfloatgt($opt{after},$locopt{info}{first}{epoch}) ) {
270 last;
273 if (!@$res){
274 next;
276 $before = $res->[-1]{epoch};
277 $before = $opt{before} if $opt{before} && _bigfloatlt($opt{before},$before);
279 $ret;
282 =head2 overview ( %options )
284 returns a small table that summarizes the state of all recentfiles
285 collected in this Recent object.
287 $options{verbose}=1 increases the number of columns displayed.
289 Here is an example output:
291 Ival Cnt Max Min Span Util Cloud
292 1h 47 1225053014.38 1225049650.91 3363.47 93.4% ^ ^
293 6h 324 1225052939.66 1225033394.84 19544.82 90.5% ^ ^
294 1d 437 1225049651.53 1224966402.53 83248.99 96.4% ^ ^
295 1W 1585 1225039015.75 1224435339.46 603676.29 99.8% ^ ^
296 1M 5855 1225017376.65 1222428503.57 2588873.08 99.9% ^ ^
297 1Q 17066 1224578930.40 1216803512.90 7775417.50 100.0% ^ ^
298 1Y 15901 1223966162.56 1216766820.67 7199341.89 22.8% ^ ^
299 Z 9909 1223966162.56 1216766820.67 7199341.89 - ^ ^
301 I<Max> is the name of the interval.
303 I<Cnt> is the number of entries in this recentfile.
305 I<Max> is the highest(first) epoch in this recentfile, rounded.
307 I<Min> is the lowest(last) epoch in this recentfile, rounded.
309 I<Span> is the timespan currently covered, rounded.
311 I<Util> is I<Span> devided by the designated timespan of this
312 recentfile.
314 I<Cloud> is ascii art illustrating the sequence of the Max and Min
315 timestamps.
317 =cut
318 sub overview {
319 my($self,%options) = @_;
320 my $rfs = $self->recentfiles;
321 my(@s,%rank);
322 RECENTFILE: for my $rf (@$rfs) {
323 my $re=$rf->recent_events;
324 my $rfsummary;
325 if (@$re) {
326 my $span = $re->[0]{epoch}-$re->[-1]{epoch};
327 my $merged = $rf->merged;
328 $rfsummary =
330 "Ival",
331 $rf->interval,
332 "Cnt",
333 scalar @$re,
334 "Dirtymark",
335 $rf->dirtymark ? sprintf("%.2f",$rf->dirtymark) : "-",
336 "Produced",
337 sprintf ("%.2f", $rf->{ORIG}{Producers}{time}||0),
338 "Merged",
339 ($rf->interval eq "Z"
343 sprintf ("%.2f", $merged->{epoch} || 0)),
344 "Max",
345 sprintf ("%.2f", $re->[0]{epoch}),
346 "Min",
347 sprintf ("%.2f", $re->[-1]{epoch}),
348 "Span",
349 sprintf ("%.2f", $span),
350 "Util", # u9n:)
351 ($rf->interval eq "Z"
355 sprintf ("%5.1f%%", 100 * $span / $rf->interval_secs)
358 @rank{mapp {$b} grepp {$a =~ /^(Max|Min)$/} @$rfsummary} = ();
359 } else {
360 next RECENTFILE;
362 push @s, $rfsummary;
364 @rank{sort {$b <=> $a} keys %rank} = 1..keys %rank;
365 my $maxrank = max values %rank;
366 for my $rfsummary (@s) {
367 my $string = " " x $maxrank;
368 my @borders;
369 for my $ele (qw(Max Min)) {
370 my($r) = mapp {$b} grepp {$a eq $ele} @$rfsummary;
371 push @borders, $rank{$r}-1;
373 for ($borders[0],$borders[1]) {
374 substr($string,$_,1) = "^";
376 push @$rfsummary, "Cloud", $string;
378 unless ($options{verbose}) {
379 my %filter = map {($_=>1)} qw(Ival Cnt Max Min Span Util Cloud);
380 for (@s) {
381 $_ = [mapp {($a,$b)} grepp {!!$filter{$a}} @$_];
384 my @sprintf;
385 for (my $i = 0; $i <= $#{$s[0]}; $i+=2) {
386 my $maxlength = max ((map { length $_->[$i+1] } @s), length $s[0][$i]);
387 push @sprintf, "%" . $maxlength . "s";
389 my $sprintf = join " ", @sprintf;
390 $sprintf .= "\n";
391 my $headline = sprintf $sprintf, mapp {$a} @{$s[0]};
392 join "", $headline, map { sprintf $sprintf, mapp {$b} @$_ } @s;
395 =head2 _pathdb
397 Keeping track of already handled files. Currently it is a hash, will
398 probably become a database with its own accessors.
400 =cut
402 sub _pathdb {
403 my($self, $set) = @_;
404 if ($set) {
405 $self->__pathdb ($set);
407 my $pathdb = $self->__pathdb;
408 unless (defined $pathdb) {
409 $self->__pathdb(+{});
411 return $self->__pathdb;
414 =head2 $recentfile = $obj->principal_recentfile ()
416 returns the principal recentfile object of this tree.
418 =cut
419 # mirrors the recentfile and instantiates the recentfile object
420 sub _principal_recentfile_fromremote {
421 my($self) = @_;
422 # get the remote recentfile
423 my $rrfile = $self->remote or die "Alert: cannot construct a recentfile object without the 'remote' attribute";
424 my $splitter = qr{(.+)/([^/]*)};
425 my($remoteroot,$rfilename) = $rrfile =~ $splitter;
426 $self->remoteroot($remoteroot);
427 my($abslfile, $fh);
428 if (!defined $rfilename) {
429 die "Alert: Cannot resolve '$rrfile', does not match $splitter";
430 } elsif (not length $rfilename or $rfilename eq "RECENT.recent") {
431 ($abslfile,$rfilename,$fh) = $self->_principal_recentfile_fromremote_resosymlink($rfilename);
433 my @need_args =
435 "ignore_link_stat_errors",
436 "localroot",
437 "max_files_per_connection",
438 "remoteroot",
439 "rsync_options",
440 "tempdir",
441 "ttl",
442 "verbose",
443 "verboselog",
445 my $rf0;
446 unless ($abslfile) {
447 $rf0 = File::Rsync::Mirror::Recentfile->new (map {($_ => $self->$_)} @need_args);
448 $rf0->split_rfilename($rfilename);
449 $abslfile = $rf0->get_remote_recentfile_as_tempfile ();
451 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ( $abslfile );
452 $rf0->_current_tempfile ( $abslfile );
453 $rf0->_current_tempfile_fh ( $fh );
454 $rf0->_use_tempfile (1);
455 for my $override (@need_args) {
456 $rf0->$override ( $self->$override );
458 $rf0->is_slave (1);
459 return $rf0;
461 sub principal_recentfile {
462 my($self) = @_;
463 my $rf0 = $self->_principal_recentfile;
464 return $rf0 if defined $rf0;
465 my $local = $self->local;
466 if ($local) {
467 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ($local);
468 } else {
469 if (my $remote = $self->remote) {
470 my $localroot;
471 if ($localroot = $self->localroot) {
472 # nice, they know what they are doing
473 } else {
474 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
476 $rf0 = $self->_principal_recentfile_fromremote;
477 } else {
478 die "Alert: neither local nor remote specified, cannot continue";
481 $self->_principal_recentfile($rf0);
482 return $rf0;
485 =head2 $recentfiles_arrayref = $obj->recentfiles ()
487 returns a reference to the complete list of recentfile objects that
488 describe this tree. No guarantee is given that the represented
489 recentfiles exist or have been read. They are just bare objects.
491 =cut
493 sub recentfiles {
494 my($self) = @_;
495 my $rfs = $self->_recentfiles;
496 return $rfs if defined $rfs;
497 my $rf0 = $self->principal_recentfile;
498 my $pathdb = $self->_pathdb;
499 $rf0->_pathdb ($pathdb);
500 my $aggregator = $rf0->aggregator;
501 my @rf = $rf0;
502 for my $agg (@$aggregator) {
503 my $nrf = $rf0->_sparse_clone;
504 $nrf->interval ( $agg );
505 $nrf->have_mirrored ( 0 );
506 $nrf->_pathdb ( $pathdb );
507 push @rf, $nrf;
509 $self->_recentfiles(\@rf);
510 return \@rf;
513 =head2 $success = $obj->rmirror ( %options )
515 Mirrors all recentfiles of the I<remote> address working through all
516 of them, mirroring their contents.
518 Test this with:
520 use File::Rsync::Mirror::Recent;
521 my $rrr = File::Rsync::Mirror::Recent->new(
522 ignore_link_stat_errors => 1,
523 localroot => "/home/ftp/pub/PAUSE/authors",
524 remote => "pause.perl.org::authors/RECENT.recent",
525 max_files_per_connection => 5000,
526 rsync_options => {
527 compress => 1,
528 links => 1,
529 times => 1,
530 checksum => 0,
532 verbose => 1,
533 _runstatusfile => "recent-rmirror-state.yml",
534 _logfilefordone => "recent-rmirror-donelog.log",
536 $rrr->rmirror ( "skip-deletes" => 1, loop => 1 );
538 Or try without the loop parameter and write the loop yourself:
540 use File::Rsync::Mirror::Recent;
541 my @rrr;
542 for my $t ("authors","modules"){
543 my $rrr = File::Rsync::Mirror::Recent->new(
544 ignore_link_stat_errors => 1,
545 localroot => "/home/ftp/pub/PAUSE/$t",
546 remote => "pause.perl.org::$t/RECENT.recent",
547 max_files_per_connection => 512,
548 rsync_options => {
549 compress => 1,
550 links => 1,
551 times => 1,
552 checksum => 0,
554 verbose => 1,
555 _runstatusfile => "recent-rmirror-state-$t.yml",
556 _logfilefordone => "recent-rmirror-donelog-$t.log",
557 ttl => 5,
559 push @rrr, $rrr;
561 while (){
562 for my $rrr (@rrr){
563 $rrr->rmirror ( "skip-deletes" => 1 );
565 warn "sleeping 23\n"; sleep 23;
569 =cut
570 # _alluptodate is unused but at least it worked last time I needed it,
571 # so let us keep it around
572 sub _alluptodate {
573 my($self) = @_;
574 my $sdm = $self->_dirtymark;
575 return unless defined $sdm;
576 for my $rf (@{$self->recentfiles}) {
577 return if $rf->seeded;
578 my $rfdm = $rf->dirtymark;
579 return unless defined $rfdm;
580 return unless $rfdm eq $sdm;
581 my $done = $rf->done;
582 return unless defined $done;
583 my $done_intervals = $done->_intervals;
584 return if !defined $done_intervals;
585 # nonono, may be more than one, only covered it must be:
586 # return if @$done_intervals > 1;
587 my $minmax = $rf->minmax;
588 return unless defined $minmax;
589 return unless $done->covered(@$minmax{qw(max min)});
591 # $DB::single++;
592 return 1;
594 sub _fullseed {
595 my($self) = @_;
596 for ( @{$self->recentfiles} ) { $_->seed(1) }
598 sub rmirror {
599 my($self, %options) = @_;
601 my $rfs = $self->recentfiles;
603 $self->principal_recentfile->seed;
604 my $_sigint = sub {
605 # XXX exit gracefully (reminder)
608 # XXX needs accessor: warning, if set too low, we do nothing but
609 # mirror the principal!
610 my $minimum_time_per_loop = 20;
612 if (my $logfile = $self->_logfilefordone) {
613 for my $i (0..$#$rfs) {
614 $rfs->[$i]->done->_logfile($logfile);
617 if (my $dirtymark = $self->principal_recentfile->dirtymark) {
618 my $mydm = $self->_dirtymark;
619 if (!defined $mydm){
620 $self->_dirtymark($dirtymark);
621 } elsif ($dirtymark ne $mydm) {
622 if ($self->verbose) {
623 my $fh;
624 if (my $vl = $self->verboselog) {
625 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
626 } else {
627 $fh = \*STDERR;
629 print $fh "NewDirtymark: old[$mydm] new[$dirtymark]\n";
631 $self->_dirtymark($dirtymark);
634 my $file = $self->runstatusfile;
635 $self->_rmirror_runstatusfile_write ($file, \%options);
636 $self->_rmirror_loop($minimum_time_per_loop,\%options);
639 sub _rmirror_loop {
640 my($self,$minimum_time_per_loop,$options) = @_;
641 LOOP: while () {
642 my $ttleave = time + $minimum_time_per_loop;
643 my $file = $self->runstatusfile;
644 my $child = $self->_thaw_if_small_enough ($file);
645 if ($child && $child->recentfiles->[-1]->uptodate) {
646 warn "DEBUG: parent process[$$] about to leave loop";
647 last LOOP;
649 my $pid;
650 if ($options->{loop}) {
651 $pid = fork;
652 } else {
653 $pid = 0;
655 if (! defined $pid) {
656 } elsif ($pid) {
657 waitpid($pid,0);
658 } else {
659 $self = $child || $self->thaw ($file);
660 my $rfs = $self->recentfiles;
661 $self->principal_recentfile->seed;
662 RECENTFILE: for my $i (0..$#$rfs) {
663 my $rf = $rfs->[$i];
664 if (time > $ttleave) {
665 # Must make sure that one file can get fetched in any case
666 $self->_max_one_state(1);
668 if ($rf->seeded) {
669 $self->_rmirror_mirror ($i, $options);
670 } elsif ($rf->uptodate) {
671 if ($i < $#$rfs) {
672 $rfs->[$i+1]->done->merge($rf->done);
674 # no further seed necessary because "periodic" does it
675 next RECENTFILE;
677 WORKUNIT: while (time < $ttleave) {
678 if ($rf->uptodate) {
679 $self->_rmirror_sleep_per_connection ($i);
680 next RECENTFILE;
681 } else {
682 $self->_rmirror_mirror ($i, $options);
685 if ($self->_max_one_state) {
686 last RECENTFILE;
689 $self->_max_one_state(0);
690 $self->_rmirror_runstatusfile_write ($file, $options);
691 if ($rfs->[-1]->uptodate) {
692 $self->_rmirror_cleanup;
693 my $file = $self->runstatusfile;
694 unless ($options->{loop}) {
695 warn "DEBUG: uptodate child process[$$] about to leave loop";
696 sleep 1.5;
697 last LOOP;
699 } elsif ($options->{loop}) {
700 warn "DEBUG: child process[$$] about to leave loop";
701 sleep 1.5;
702 last LOOP;
706 my $sleep = $ttleave - time;
707 if ($sleep > 0.01) {
708 $self->_rmirror_endofloop_sleep ($sleep);
709 } else {
710 # negative time not invented yet:)
715 sub _rmirror_mirror {
716 my($self, $i, $options) = @_;
717 my $rfs = $self->recentfiles;
718 my $rf = $rfs->[$i];
719 my %locopt = %$options;
720 if ($self->_max_one_state) {
721 $locopt{max} = 1;
723 $locopt{piecemeal} = 1;
724 $rf->mirror (%locopt);
725 if ($i==0) {
726 # we limit to 0 for the case that upstream is broken and has
727 # more than one timestamp (happened on PAUSE 200903)
728 if (my $dirtymark = $rf->dirtymark) {
729 my $mydm = $self->_dirtymark;
730 if (!defined $mydm or $dirtymark ne $mydm) {
731 $self->_dirtymark($dirtymark);
732 $self->_fullseed;
738 sub _rmirror_sleep_per_connection {
739 my($self, $i) = @_;
740 my $rfs = $self->recentfiles;
741 my $rf = $rfs->[$i];
742 my $sleep = $rf->sleep_per_connection;
743 $sleep = 0.42 unless defined $sleep;
744 Time::HiRes::sleep $sleep;
745 $rfs->[$i+1]->done->merge($rf->done) if $i < $#$rfs;
748 sub _rmirror_cleanup {
749 my($self) = @_;
750 my $pathdb = $self->_pathdb();
751 for my $k (keys %$pathdb) {
752 delete $pathdb->{$k};
754 my $rfs = $self->recentfiles;
755 for my $i (0..$#$rfs-1) {
756 my $thismerged = $rfs->[$i]->merged;
757 my $next = $rfs->[$i+1];
758 my $nextminmax = $next->minmax;
759 # warn "DEBUG: i[$i] nextminmaxmax[$nextminmax->{max}] thismergedepoch[$thismerged->{epoch}]";
760 if (not defined $thismerged->{epoch} or _bigfloatlt($nextminmax->{max},$thismerged->{epoch})){
761 $next->seed;
762 # warn sprintf "DEBUG: next iv %s seeded since next-minmax-max[$nextminmax->{max}]lt this-merged-epoch[$thismerged->{epoch}]\n", $next->interval;
767 =head2 $file = $obj->runstatusfile ($set)
769 Getter/setter for C<_runstatusfile> attribute. Defaults to a temporary
770 file created by C<File::Temp>. A status file is required for
771 C<rmirror> working. Since it may be interesting for debugging
772 purposes, you may want to specify a permanent file for this.
774 =cut
775 sub runstatusfile {
776 my($self,$set) = @_;
777 if (defined $set) {
778 $self->_runstatusfile ($set);
780 my $x = $self->_runstatusfile;
781 unless (defined $x) {
782 require File::Temp;
783 my $tfile = File::Temp->new
785 TEMPLATE => "Recent-XXXX",
786 TMPDIR => 1,
787 UNLINK => 0,
788 CLEANUP => 0,
789 SUFFIX => '.dat',
791 $self->_runstatusfile($tfile->filename);
793 return $self->_runstatusfile;
796 # unused code.... it was an oops, discovered the thaw() method too
797 # late, and starting writing this here....
798 sub _rmirror_runstatusfile_read {
799 my($self, $file) = @_;
801 require YAML::Syck;
802 my $start = time;
803 # XXX is locking useful here?
804 while (not mkdir "$file.lock") {
805 Time::HiRes::sleep 0.2;
806 warn "*** waiting for lock ***" if time - $start >= 3;
808 my $yml = YAML::Syck::LoadFile $file;
809 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
810 my $rself = $yml->{reduced_self};
811 my $rfs = $yml->{reduced_rfs};
812 # XXX bring them into self
815 sub _rmirror_runstatusfile_write {
816 my($self, $file, $options) = @_;
817 my $rself;
818 while (my($k,$v) = each %$self) {
819 next if $k =~ /^-(_principal_recentfile|_recentfiles)$/;
820 $rself->{$k} = $v;
822 my $rfs = $self->recentfiles;
823 my $rrfs;
824 for my $i (0..$#$rfs) {
825 my $rf = $rfs->[$i];
826 while (my($k,$v) = each %$rf) {
827 next if $k =~ /^-(_current_tempfile_fh|_pathdb|_rsync)$/;
828 $rrfs->[$i]{$k} = $rfs->[$i]{$k};
831 require YAML::Syck;
832 my $start = time;
833 while (not mkdir "$file.lock") {
834 Time::HiRes::sleep 0.02;
835 warn "*** waiting for lock ***" if time - $start >= 3;
837 YAML::Syck::DumpFile
839 "$file.new",
841 options => $options,
842 time => time,
843 reduced_rfs => $rrfs,
844 reduced_self => $rself,
846 rename "$file.new", $file or die "Could not rename: $!";
847 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
850 sub _rmirror_endofloop_sleep {
851 my($self, $sleep) = @_;
852 if ($self->verbose) {
853 my $fh;
854 if (my $vl = $self->verboselog) {
855 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
856 } else {
857 $fh = \*STDERR;
859 printf $fh
861 "Dorm %d (%s secs)\n",
862 time,
863 $sleep,
865 sleep $sleep;
869 # it returns two things: abslfile and rfilename. But the abslfile is
870 # undef when the rfilename ends in .recent. A weird interface, my
871 # friend.
872 sub _principal_recentfile_fromremote_resosymlink {
873 my($self, $rfilename) = @_;
874 $rfilename = "RECENT.recent" unless length $rfilename;
875 my $abslfile = undef;
876 my $fh;
877 if ($rfilename =~ /\.recent$/) {
878 # may be a file *or* a symlink,
879 ($abslfile,$fh) = $self->_fetch_as_tempfile ($rfilename);
880 while (-l $abslfile) {
881 my $symlink = readlink $abslfile;
882 if ($symlink =~ m|/|) {
883 die "FIXME: filenames containing '/' not supported, got '$symlink'";
885 my $localrfile = File::Spec->catfile($self->localroot, $rfilename);
886 if (-e $localrfile) {
887 my $old_symlink = readlink $localrfile;
888 if ($old_symlink eq $symlink) {
889 unlink $abslfile or die "Cannot unlink '$abslfile': $!";
890 } else {
891 unlink $localrfile; # may fail
892 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
894 } else {
895 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
897 ($abslfile,$fh) = $self->_fetch_as_tempfile ($symlink);
900 return ($abslfile, $rfilename, $fh);
903 # takes a basename, returns an absolute name, does not delete the
904 # file, throws the $fh away. Caller must rename or unlink
906 # XXX needs to activate the fh in the rf0 so that it is able to unlink
907 # the file. I would like that the file is used immediately by $rf0
908 sub _fetch_as_tempfile {
909 my($self, $rfile) = @_;
910 my($suffix) = $rfile =~ /(\.[^\.]+)$/;
911 $suffix = "" unless defined $suffix;
912 my $fh = File::Temp->new
913 (TEMPLATE => sprintf(".FRMRecent-%s-XXXX",
914 $rfile,
916 DIR => $self->tempdir || $self->localroot,
917 SUFFIX => $suffix,
918 UNLINK => 0,
920 my $rsync;
921 unless ($rsync = File::Rsync->new($self->rsync_options)) {
922 require Carp;
923 Carp::confess(YAML::Syck::Dump($self->rsync_options));
925 my $dst = $fh->filename;
926 local($ENV{LANG}) = "C";
927 $rsync->exec
929 src => join("/",$self->remoteroot,$rfile),
930 dst => $dst,
931 ) or die "Could not mirror '$rfile' to $fh\: ".join(" ",$rsync->err);
932 unless (-l $dst) {
933 my $mode = 0644;
934 chmod $mode, $dst or die "Could not chmod $mode '$dst': $!";
936 return($dst,$fh);
939 =head2 $verbose = $obj->verbose ( $set )
941 Getter/setter method to set verbosity for this F:R:M:Recent object and
942 all associated Recentfile objects.
944 =cut
945 sub verbose {
946 my($self,$set) = @_;
947 if (defined $set) {
948 for ( @{$self->recentfiles} ) { $_->verbose($set) }
949 $self->_verbose ($set);
951 my $x = $self->_verbose;
952 unless (defined $x) {
953 $x = 0;
954 $self->_verbose ($x);
956 return $x;
960 =head2 my $vl = $obj->verboselog ( $set )
962 Getter/setter method for the path to the logfile to write verbose
963 progress information to.
965 Note: This is a primitive stop gap solution to get simple verbose
966 logging working. The program still sends error messages to STDERR.
967 Switching to Log4perl or similar is probably the way to go. TBD.
969 =cut
970 sub verboselog {
971 my($self,$set) = @_;
972 if (defined $set) {
973 for ( @{$self->recentfiles} ) { $_->verboselog($set) }
974 $self->_verboselog ($set);
976 my $x = $self->_verboselog;
977 unless (defined $x) {
978 $x = 0;
979 $self->_verboselog ($x);
981 return $x;
984 =head1 THE ARCHITECTURE OF A COLLECTION OF RECENTFILES
986 The idea is that we want to have a short file that records really
987 recent changes. So that a fresh mirror can be kept fresh as long as
988 the connectivity is given. Then we want longer files that record the
989 history before. So when the mirror falls behind the update period
990 reflected in the shortest file, it can complement the list of recent
991 file events with the next one. And if this is not long enough we want
992 another one, again a bit longer. And we want one that completes the
993 history back to the oldest file. The index files do contain the
994 complete list of current files. The longer a period covered by an
995 index file is gone the less often the index file is updated. For
996 practical reasons adjacent files will often overlap a bit but this is
997 neither necessary nor enforced. That's the basic idea. The following
998 example represents a tree that has a few updates every day:
1000 RECENT.recent -> RECENT-1h.yaml
1001 RECENT-1h.yaml
1002 RECENT-6h.yaml
1003 RECENT-1d.yaml
1004 RECENT-1M.yaml
1005 RECENT-1W.yaml
1006 RECENT-1Q.yaml
1007 RECENT-1Y.yaml
1008 RECENT-Z.yaml
1010 The first file is the principal file, in so far it is the one that is
1011 written first after a filesystem change. Usually a symlink links to it
1012 with a filename that has the same filenameroot and the suffix
1013 C<.recent>. On systems that do not support symlinks there is a plain
1014 copy maintained instead.
1016 The last file, the Z file, contains the complementary files that are
1017 in none of the other files. It may contain C<delete> events but often
1018 C<delete> events are discarded at the transition to the Z file.
1020 =head2 THE INDIVIDUAL RECENTFILE
1022 A I<recentfile> consists of a hash that has two keys: C<meta> and
1023 C<recent>. The C<meta> part has metadata and the C<recent> part has a
1024 list of fileobjects.
1026 =head2 THE META PART
1028 Here we find things that are pretty much self explaining: all
1029 lowercase attributes are accessors and as such explained in the
1030 manpages. The uppercase attribute C<Producers> contains version
1031 information about involved software components. Nothing to worry about
1032 as I believe.
1034 =head2 THE RECENT PART
1036 This is the interesting part. Every entry refers to some filesystem
1037 change (with path, epoch, type).
1039 The I<epoch> value is the point in time when some change was
1040 I<registered> but can be set to arbitrary values. Do not be tempted to
1041 believe that the entry has a direct relation to something like
1042 modification time or change time on the filesystem level. They are not
1043 reflecting release dates. (If you want exact release dates: Barbie is
1044 providing a database of them. See
1045 http://use.perl.org/~barbie/journal/37907).
1047 All these entries can be devided into two types (denoted by the
1048 I<type> attribute): C<new>s and C<delete>s. Changes and creations are
1049 C<new>s. Deletes are C<delete>s.
1051 Besides an I<epoch> and a I<type> attribute we find a third one:
1052 I<path>. This path is relative to the directory we find the
1053 I<recentfile> in.
1055 The order of the entries in the I<recentfile> is by decreasing epoch
1056 attribute. These are unique floating point numbers. When the server
1057 has ntp running correctly, then the timestamps are usually reflecting
1058 a real epoch. If time is running backwards, we trump the system epoch
1059 with strictly monotonically increasing floating point timestamps and
1060 guarantee they are unique.
1062 =head1 CORRUPTION AND RECOVERY
1064 If the origin host breaks the promise to deliver consistent and
1065 complete I<recentfiles> then it must update its C<dirtymark> and all
1066 slaves must discard what they cosider the truth. In the worst case
1067 that something goes wrong despite the dirtymark mechanism the way back
1068 to sanity can always be achieved through traditional rsyncing between
1069 the hosts.
1071 =head1 BACKGROUND
1073 This is about speeding up rsync operation on large trees. Uses a small
1074 metadata cocktail and pull technology.
1076 rersyncrecent solves this problem with a couple of (usually 2-10)
1077 lightweight index files which cover different overlapping time
1078 intervals. The master writes these files and the clients/slaves can
1079 construct the full tree from the information contained in them. The
1080 most recent index file usually covers the last seconds or minutes or
1081 hours of the tree and depending on the needs, slaves can rsync every
1082 few seconds or minutes and then bring their trees in full sync.
1084 The rersyncrecent model was developed for CPAN but as it is both
1085 convenient and economic it is also a general purpose solution. I'm
1086 looking forward to see a CPAN backbone that is only a few seconds
1087 behind PAUSE.
1089 =head2 NON-COMPETITORS
1091 File::Mirror JWU/File-Mirror/File-Mirror-0.10.tar.gz only local trees
1092 Mirror::YAML ADAMK/Mirror-YAML-0.03.tar.gz some sort of inner circle
1093 Net::DownloadMirror KNORR/Net-DownloadMirror-0.04.tar.gz FTP sites and stuff
1094 Net::MirrorDir KNORR/Net-MirrorDir-0.05.tar.gz dito
1095 Net::UploadMirror KNORR/Net-UploadMirror-0.06.tar.gz dito
1096 Pushmi::Mirror CLKAO/Pushmi-v1.0.0.tar.gz something SVK
1098 rsnapshot www.rsnapshot.org focus on backup
1099 csync www.csync.org more like unison
1100 multi-rsync sourceforge 167893 lan push to many
1101 chasm chasmd.org per-directory manifests
1103 =head2 COMPETITORS
1105 The problem to solve which clusters and ftp mirrors and otherwise
1106 replicated datasets like CPAN share: how to transfer only a minimum
1107 amount of data to determine the diff between two hosts.
1109 Normally it takes a long time to determine the diff itself before it
1110 can be transferred. Known solutions at the time of this writing are
1111 csync2, and rsync 3 batch mode.
1113 For many years the best solution was B<csync2> which solves the
1114 problem by maintaining a sqlite database on both ends and talking a
1115 highly sophisticated protocol to quickly determine which files to send
1116 and which to delete at any given point in time. Csync2 is often
1117 inconvenient because it is push technology and the act of syncing
1118 demands quite an intimate relationship between the sender and the
1119 receiver. This is hard to achieve in an environment of loosely coupled
1120 sites where the number of sites is large or connections are unreliable
1121 or network topology is changing.
1123 B<Rsync 3 batch mode> works around these problems by providing
1124 rsync-able batch files which allow receiving nodes to replay the
1125 history of the other nodes. This reduces the need to have an
1126 incestuous relation but it has the disadvantage that these batch files
1127 replicate the contents of the involved files. This seems inappropriate
1128 when the nodes already have a means of communicating over rsync.
1130 =head2 HONORABLE MENTION
1132 B<instantmirror> at https://fedorahosted.org/InstantMirror/ is an
1133 ambitious project that tries to combine various technologies (squid,
1134 bittorrent) to overcome the current slowness with the main focus on
1135 fedora. It's been founded in 2009-03 and at the time of this writing
1136 it is still a bit early to comment on.
1138 =head1 LIMITATIONS
1140 If the tree of the master server is changing faster than the bandwidth
1141 permits to mirror then additional protocols may need to be deployed.
1142 Certainly p2p/bittorrent can help in such situations because
1143 downloading sites help each other and bittorrent chunks large files
1144 into pieces.
1146 =head1 FUTURE DIRECTIONS
1148 Currently the origin server must keep track of injected and removed
1149 files. Should be supported by an inotify-based assistant.
1151 Convince other users outside the CPAN like
1152 http://fedoraproject.org/wiki/Infrastructure/Mirroring
1154 =head1 SEE ALSO
1156 L<File::Rsync::Mirror::Recentfile>,
1157 L<File::Rsync::Mirror::Recentfile::Done>,
1158 L<File::Rsync::Mirror::Recentfile::FakeBigFloat>
1160 =head1 BUGS
1162 Please report any bugs or feature requests through the web interface
1164 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=File-Rsync-Mirror-Recent>.
1165 I will be notified, and then you'll automatically be notified of
1166 progress on your bug as I make changes.
1168 =head1 SUPPORT
1170 You can find documentation for this module with the perldoc command.
1172 perldoc File::Rsync::Mirror::Recent
1174 You can also look for information at:
1176 =over 4
1178 =item * RT: CPAN's request tracker
1180 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=File-Rsync-Mirror-Recent>
1182 =item * AnnoCPAN: Annotated CPAN documentation
1184 L<http://annocpan.org/dist/File-Rsync-Mirror-Recent>
1186 =item * CPAN Ratings
1188 L<http://cpanratings.perl.org/d/File-Rsync-Mirror-Recent>
1190 =item * Search CPAN
1192 L<http://search.cpan.org/dist/File-Rsync-Mirror-Recent>
1194 =back
1197 =head1 ACKNOWLEDGEMENTS
1199 Thanks to RJBS for module-starter.
1201 =head1 AUTHOR
1203 Andreas König
1205 =head1 COPYRIGHT & LICENSE
1207 Copyright 2008, 2009 Andreas König.
1209 This program is free software; you can redistribute it and/or modify it
1210 under the same terms as Perl itself.
1213 =cut
1215 1; # End of File::Rsync::Mirror::Recent