this version did the cleanup but the children did not exit
[rersyncrecent.git] / lib / File / Rsync / Mirror / Recent.pm
blobaf0379bf0526b21cbb562bb3a3e1530cc027accc
1 package File::Rsync::Mirror::Recent;
3 # use warnings;
4 use strict;
5 use File::Rsync::Mirror::Recentfile;
7 =encoding utf-8
9 =head1 NAME
11 File::Rsync::Mirror::Recent - mirroring via rsync made efficient
13 =cut
15 package File::Rsync::Mirror::Recent;
17 use File::Basename qw(basename dirname fileparse);
18 use File::Copy qw(cp);
19 use File::Path qw(mkpath);
20 use File::Rsync;
21 use File::Rsync::Mirror::Recentfile::Done (); # at least needed by thaw()
22 use File::Rsync::Mirror::Recentfile::FakeBigFloat qw(:all);
23 use File::Temp;
24 use List::Pairwise qw(mapp grepp);
25 use List::Util qw(first max);
26 use Scalar::Util qw(blessed reftype);
27 use Storable;
28 use Time::HiRes qw();
29 use YAML::Syck;
31 use version; our $VERSION = qv('0.0.8');
33 =head1 SYNOPSIS
35 B<!!!! PRE-ALPHA ALERT !!!!>
37 Nothing in here is believed to be stable, nothing yet intended for
38 public consumption. The plan is to provide scripts that act as
39 frontends for all the backend functionality. Option and method names
40 may still change.
42 For the rationale see the section BACKGROUND.
44 The documentation in here is normally not needed because the code is
45 meant to be run from several standalone programs. For a quick
46 overview, see the file README.mirrorcpan and the bin/ directory of the
47 distribution. For the architectural ideas see the section THE
48 ARCHITECTURE OF A COLLECTION OF RECENTFILES below.
50 File::Rsync::Mirror::Recent establishes a view on a collection of
51 File::Rsync::Mirror::Recentfile objects and provides abstractions
52 spanning multiple time intervals associated with those.
54 =head1 EXPORT
56 No exports.
58 =head1 CONSTRUCTORS
60 =head2 my $obj = CLASS->new(%hash)
62 Constructor. On every argument pair the key is a method name and the
63 value is an argument to that method name.
65 =cut
67 sub new {
68 my($class, @args) = @_;
69 my $self = bless {}, $class;
70 while (@args) {
71 my($method,$arg) = splice @args, 0, 2;
72 $self->$method($arg);
74 return $self;
77 =head2 my $obj = CLASS->thaw($statusfile)
79 Constructor from a statusfile left over from a previous
80 rmirror run. See also C<runstatusfile>.
82 =cut
84 sub _thaw_if_small_enough {
85 my($self,$file) = @_;
86 return if -s $file > 100_000; # XXX should read and look how
87 # many lines we have for
88 # "reduced_self.-__pathdb"?
89 return $self->thaw($file);
91 sub thaw {
92 my($self, $file) = @_;
93 die "thaw called without statusfile argument" unless defined $file;
94 die "Alert: statusfile '$file' not found" unless -e $file;
95 require YAML::Syck;
96 my $start = time;
97 my $sleeptime = 0.02;
98 while (not mkdir "$file.lock") {
99 my $err = $!;
100 Time::HiRes::sleep $sleeptime;
101 my $waiting = time - $start;
102 if ($waiting >= 3){
103 warn "*** waiting ($waiting) for lock ($err) ***";
104 $sleeptime = 1;
107 my $size = -s $file;
108 my $serialized = YAML::Syck::LoadFile($file);
109 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
110 warn sprintf "DEBUG: Process $$ reading '$file' (size=$size). It was written %d seconds ago", time-$serialized->{time};
111 my $charged_self = $serialized->{reduced_self};
112 my $class = blessed $self;
113 bless $charged_self, $class;
114 my $rfs = $serialized->{reduced_rfs};
115 my $rfclass = $class . "file"; # "Recent" . "file"
116 my $pathdb = $charged_self->_pathdb;
117 for my $rf (@$rfs) {
118 bless $rf, $rfclass;
119 $rf->_pathdb($pathdb);
121 $charged_self->_recentfiles($rfs);
122 $charged_self->_principal_recentfile($rfs->[0]);
123 # die "FIXME: thaw all recentfiles from reduced_rfs into _recentfiles as well, watch out for pathdb and rsync";
124 return $charged_self;
127 =head1 ACCESSORS
129 =cut
131 my @accessors;
133 BEGIN {
134 @accessors =
136 "__pathdb",
137 "_dirtymark", # keeps track of the dirtymark of the recentfiles
138 "_logfilefordone", # turns on _logfile on all DONE
139 # systems (disk intensive)
140 "_max_one_state", # when we have no time left but want
141 # at least get one file per
142 # iteration to avoid procrastination
143 "_principal_recentfile",
144 "_recentfiles",
145 "_rsync",
146 "_runstatusfile", # occasionally dumps all rfs
147 "_verbose", # internal variable for verbose setter/getter
148 "_verboselog", # internal variable for verboselog setter/getter
151 my @pod_lines =
152 split /\n/, <<'=cut'; push @accessors, grep {s/^=item\s+//} @pod_lines; }
154 =over 4
156 =item ignore_link_stat_errors
158 as in F:R:M:Recentfile
160 =item local
162 Option to specify the local principal file for operations with a local
163 collection of recentfiles.
165 =item localroot
167 as in F:R:M:Recentfile
169 =item max_files_per_connection
171 as in F:R:M:Recentfile
173 =item remote
175 The remote principal recentfile in rsync notation. E.g.
177 pause.perl.org::authors/RECENT.recent
179 =item remoteroot
181 as in F:R:M:Recentfile
183 =item remote_recentfile
185 Rsync address of the remote C<RECENT.recent> symlink or whichever name
186 the principal remote recentfile has.
188 =item rsync_options
190 Things like compress, links, times or checksums. Passed in to the
191 File::Rsync object used to run the mirror.
193 =item tempdir
195 as in F:R:M:Recentfile
197 =item ttl
199 Minimum time before fetching the principal recentfile again.
201 =back
203 =cut
205 use accessors @accessors;
207 =head1 METHODS
209 =head2 $arrayref = $obj->news ( %options )
211 Test this with:
213 perl -Ilib bin/rrr-news \
214 -after 1217200539 \
215 -max 12 \
216 -local /home/ftp/pub/PAUSE/authors/RECENT.recent
218 perl -Ilib bin/rrr-news \
219 -after 1217200539 \
220 -rsync=compress=1 \
221 -rsync=links=1 \
222 -localroot /home/ftp/pub/PAUSE/authors/ \
223 -remote pause.perl.org::authors/RECENT.recent
224 -verbose
226 Note: all parameters that can be passed to recent_events can also be specified here.
228 Note: all data are kept in memory
230 =cut
232 sub news {
233 my($self, %opt) = @_;
234 my $local = $self->local;
235 unless ($local) {
236 if (my $remote = $self->remote) {
237 my $localroot;
238 if ($localroot = $self->localroot) {
239 # nice, they know what they are doing
240 } else {
241 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
243 } else {
244 die "Alert: neither local nor remote specified, cannot continue";
247 my $rfs = $self->recentfiles;
248 my $ret = [];
249 my $before;
250 for my $rf (@$rfs) {
251 my %locopt = %opt;
252 $locopt{before} = $before;
253 if ($opt{max}) {
254 $locopt{max} -= scalar @$ret;
255 last if $locopt{max} <= 0;
257 $locopt{info} = {};
258 my $res = $rf->recent_events(%locopt);
259 if (@$res){
260 push @$ret, @$res;
262 if ($opt{max} && scalar @$ret > $opt{max}) {
263 last;
265 if ($opt{after}){
266 if ( $locopt{info}{last} && _bigfloatlt($locopt{info}{last}{epoch},$opt{after}) ) {
267 last;
269 if ( _bigfloatgt($opt{after},$locopt{info}{first}{epoch}) ) {
270 last;
273 if (!@$res){
274 next;
276 $before = $res->[-1]{epoch};
277 $before = $opt{before} if $opt{before} && _bigfloatlt($opt{before},$before);
279 $ret;
282 =head2 overview ( %options )
284 returns a small table that summarizes the state of all recentfiles
285 collected in this Recent object.
287 $options{verbose}=1 increases the number of columns displayed.
289 Here is an example output:
291 Ival Cnt Max Min Span Util Cloud
292 1h 47 1225053014.38 1225049650.91 3363.47 93.4% ^ ^
293 6h 324 1225052939.66 1225033394.84 19544.82 90.5% ^ ^
294 1d 437 1225049651.53 1224966402.53 83248.99 96.4% ^ ^
295 1W 1585 1225039015.75 1224435339.46 603676.29 99.8% ^ ^
296 1M 5855 1225017376.65 1222428503.57 2588873.08 99.9% ^ ^
297 1Q 17066 1224578930.40 1216803512.90 7775417.50 100.0% ^ ^
298 1Y 15901 1223966162.56 1216766820.67 7199341.89 22.8% ^ ^
299 Z 9909 1223966162.56 1216766820.67 7199341.89 - ^ ^
301 I<Max> is the name of the interval.
303 I<Cnt> is the number of entries in this recentfile.
305 I<Max> is the highest(first) epoch in this recentfile, rounded.
307 I<Min> is the lowest(last) epoch in this recentfile, rounded.
309 I<Span> is the timespan currently covered, rounded.
311 I<Util> is I<Span> devided by the designated timespan of this
312 recentfile.
314 I<Cloud> is ascii art illustrating the sequence of the Max and Min
315 timestamps.
317 =cut
318 sub overview {
319 my($self,%options) = @_;
320 my $rfs = $self->recentfiles;
321 my(@s,%rank);
322 RECENTFILE: for my $rf (@$rfs) {
323 my $re=$rf->recent_events;
324 my $rfsummary;
325 if (@$re) {
326 my $span = $re->[0]{epoch}-$re->[-1]{epoch};
327 my $merged = $rf->merged;
328 $rfsummary =
330 "Ival",
331 $rf->interval,
332 "Cnt",
333 scalar @$re,
334 "Dirtymark",
335 $rf->dirtymark ? sprintf("%.2f",$rf->dirtymark) : "-",
336 "Produced",
337 sprintf ("%.2f", $rf->{ORIG}{Producers}{time}||0),
338 "Merged",
339 ($rf->interval eq "Z"
343 sprintf ("%.2f", $merged->{epoch} || 0)),
344 "Max",
345 sprintf ("%.2f", $re->[0]{epoch}),
346 "Min",
347 sprintf ("%.2f", $re->[-1]{epoch}),
348 "Span",
349 sprintf ("%.2f", $span),
350 "Util", # u9n:)
351 ($rf->interval eq "Z"
355 sprintf ("%5.1f%%", 100 * $span / $rf->interval_secs)
358 @rank{mapp {$b} grepp {$a =~ /^(Max|Min)$/} @$rfsummary} = ();
359 } else {
360 next RECENTFILE;
362 push @s, $rfsummary;
364 @rank{sort {$b <=> $a} keys %rank} = 1..keys %rank;
365 my $maxrank = max values %rank;
366 for my $rfsummary (@s) {
367 my $string = " " x $maxrank;
368 my @borders;
369 for my $ele (qw(Max Min)) {
370 my($r) = mapp {$b} grepp {$a eq $ele} @$rfsummary;
371 push @borders, $rank{$r}-1;
373 for ($borders[0],$borders[1]) {
374 substr($string,$_,1) = "^";
376 push @$rfsummary, "Cloud", $string;
378 unless ($options{verbose}) {
379 my %filter = map {($_=>1)} qw(Ival Cnt Max Min Span Util Cloud);
380 for (@s) {
381 $_ = [mapp {($a,$b)} grepp {!!$filter{$a}} @$_];
384 my @sprintf;
385 for (my $i = 0; $i <= $#{$s[0]}; $i+=2) {
386 my $maxlength = max ((map { length $_->[$i+1] } @s), length $s[0][$i]);
387 push @sprintf, "%" . $maxlength . "s";
389 my $sprintf = join " ", @sprintf;
390 $sprintf .= "\n";
391 my $headline = sprintf $sprintf, mapp {$a} @{$s[0]};
392 join "", $headline, map { sprintf $sprintf, mapp {$b} @$_ } @s;
395 =head2 _pathdb
397 Keeping track of already handled files. Currently it is a hash, will
398 probably become a database with its own accessors.
400 =cut
402 sub _pathdb {
403 my($self, $set) = @_;
404 if ($set) {
405 $self->__pathdb ($set);
407 my $pathdb = $self->__pathdb;
408 unless (defined $pathdb) {
409 $self->__pathdb(+{});
411 return $self->__pathdb;
414 =head2 $recentfile = $obj->principal_recentfile ()
416 returns the principal recentfile object of this tree.
418 =cut
419 # mirrors the recentfile and instantiates the recentfile object
420 sub _principal_recentfile_fromremote {
421 my($self) = @_;
422 # get the remote recentfile
423 my $rrfile = $self->remote or die "Alert: cannot construct a recentfile object without the 'remote' attribute";
424 my $splitter = qr{(.+)/([^/]*)};
425 my($remoteroot,$rfilename) = $rrfile =~ $splitter;
426 $self->remoteroot($remoteroot);
427 my($abslfile, $fh);
428 if (!defined $rfilename) {
429 die "Alert: Cannot resolve '$rrfile', does not match $splitter";
430 } elsif (not length $rfilename or $rfilename eq "RECENT.recent") {
431 ($abslfile,$rfilename,$fh) = $self->_principal_recentfile_fromremote_resosymlink($rfilename);
433 my @need_args =
435 "ignore_link_stat_errors",
436 "localroot",
437 "max_files_per_connection",
438 "remoteroot",
439 "rsync_options",
440 "tempdir",
441 "ttl",
442 "verbose",
443 "verboselog",
445 my $rf0;
446 unless ($abslfile) {
447 $rf0 = File::Rsync::Mirror::Recentfile->new (map {($_ => $self->$_)} @need_args);
448 $rf0->split_rfilename($rfilename);
449 $abslfile = $rf0->get_remote_recentfile_as_tempfile ();
451 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ( $abslfile );
452 $rf0->_current_tempfile ( $abslfile );
453 $rf0->_current_tempfile_fh ( $fh );
454 $rf0->_use_tempfile (1);
455 for my $override (@need_args) {
456 $rf0->$override ( $self->$override );
458 $rf0->is_slave (1);
459 return $rf0;
461 sub principal_recentfile {
462 my($self) = @_;
463 my $rf0 = $self->_principal_recentfile;
464 return $rf0 if defined $rf0;
465 my $local = $self->local;
466 if ($local) {
467 $rf0 = File::Rsync::Mirror::Recentfile->new_from_file ($local);
468 } else {
469 if (my $remote = $self->remote) {
470 my $localroot;
471 if ($localroot = $self->localroot) {
472 # nice, they know what they are doing
473 } else {
474 die "FIXME: remote called without localroot should trigger File::Temp.... TBD, sorry";
476 $rf0 = $self->_principal_recentfile_fromremote;
477 } else {
478 die "Alert: neither local nor remote specified, cannot continue";
481 $self->_principal_recentfile($rf0);
482 return $rf0;
485 =head2 $recentfiles_arrayref = $obj->recentfiles ()
487 returns a reference to the complete list of recentfile objects that
488 describe this tree. No guarantee is given that the represented
489 recentfiles exist or have been read. They are just bare objects.
491 =cut
493 sub recentfiles {
494 my($self) = @_;
495 my $rfs = $self->_recentfiles;
496 return $rfs if defined $rfs;
497 my $rf0 = $self->principal_recentfile;
498 my $pathdb = $self->_pathdb;
499 $rf0->_pathdb ($pathdb);
500 my $aggregator = $rf0->aggregator;
501 my @rf = $rf0;
502 for my $agg (@$aggregator) {
503 my $nrf = $rf0->_sparse_clone;
504 $nrf->interval ( $agg );
505 $nrf->have_mirrored ( 0 );
506 $nrf->_pathdb ( $pathdb );
507 push @rf, $nrf;
509 $self->_recentfiles(\@rf);
510 return \@rf;
513 =head2 $success = $obj->rmirror ( %options )
515 Mirrors all recentfiles of the I<remote> address working through all
516 of them, mirroring their contents.
518 Test this with:
520 use File::Rsync::Mirror::Recent;
521 my $rrr = File::Rsync::Mirror::Recent->new(
522 ignore_link_stat_errors => 1,
523 localroot => "/home/ftp/pub/PAUSE/authors",
524 remote => "pause.perl.org::authors/RECENT.recent",
525 max_files_per_connection => 5000,
526 rsync_options => {
527 compress => 1,
528 links => 1,
529 times => 1,
530 checksum => 0,
532 verbose => 1,
533 _runstatusfile => "recent-rmirror-state.yml",
534 _logfilefordone => "recent-rmirror-donelog.log",
536 $rrr->rmirror ( "skip-deletes" => 1, loop => 1 );
538 Or try without the loop parameter and write the loop yourself:
540 use File::Rsync::Mirror::Recent;
541 my @rrr;
542 for my $t ("authors","modules"){
543 my $rrr = File::Rsync::Mirror::Recent->new(
544 ignore_link_stat_errors => 1,
545 localroot => "/home/ftp/pub/PAUSE/$t",
546 remote => "pause.perl.org::$t/RECENT.recent",
547 max_files_per_connection => 512,
548 rsync_options => {
549 compress => 1,
550 links => 1,
551 times => 1,
552 checksum => 0,
554 verbose => 1,
555 _runstatusfile => "recent-rmirror-state-$t.yml",
556 _logfilefordone => "recent-rmirror-donelog-$t.log",
557 ttl => 5,
559 push @rrr, $rrr;
561 while (){
562 for my $rrr (@rrr){
563 $rrr->rmirror ( "skip-deletes" => 1 );
565 warn "sleeping 23\n"; sleep 23;
569 =cut
570 # _alluptodate is unused but at least it worked last time I needed it,
571 # so let us keep it around
572 sub _alluptodate {
573 my($self) = @_;
574 my $sdm = $self->_dirtymark;
575 return unless defined $sdm;
576 for my $rf (@{$self->recentfiles}) {
577 return if $rf->seeded;
578 my $rfdm = $rf->dirtymark;
579 return unless defined $rfdm;
580 return unless $rfdm eq $sdm;
581 my $done = $rf->done;
582 return unless defined $done;
583 my $done_intervals = $done->_intervals;
584 return if !defined $done_intervals;
585 # nonono, may be more than one, only covered it must be:
586 # return if @$done_intervals > 1;
587 my $minmax = $rf->minmax;
588 return unless defined $minmax;
589 return unless $done->covered(@$minmax{qw(max min)});
591 # $DB::single++;
592 return 1;
594 sub _fullseed {
595 my($self) = @_;
596 for ( @{$self->recentfiles} ) { $_->seed(1) }
598 sub rmirror {
599 my($self, %options) = @_;
601 my $rfs = $self->recentfiles;
603 $self->principal_recentfile->seed;
604 my $_sigint = sub {
605 # XXX exit gracefully (reminder)
608 # XXX needs accessor: warning, if set too low, we do nothing but
609 # mirror the principal!
610 my $minimum_time_per_loop = 20;
612 if (my $logfile = $self->_logfilefordone) {
613 for my $i (0..$#$rfs) {
614 $rfs->[$i]->done->_logfile($logfile);
617 if (my $dirtymark = $self->principal_recentfile->dirtymark) {
618 my $mydm = $self->_dirtymark;
619 if (!defined $mydm){
620 $self->_dirtymark($dirtymark);
621 } elsif ($dirtymark ne $mydm) {
622 if ($self->verbose) {
623 my $fh;
624 if (my $vl = $self->verboselog) {
625 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
626 } else {
627 $fh = \*STDERR;
629 print $fh "NewDirtymark: old[$mydm] new[$dirtymark]\n";
631 $self->_dirtymark($dirtymark);
634 my $file = $self->runstatusfile;
635 $self->_rmirror_runstatusfile_write ($file, \%options);
636 $self->_rmirror_loop($minimum_time_per_loop,\%options);
639 sub _rmirror_loop {
640 my($self,$minimum_time_per_loop,$options) = @_;
641 LOOP: while () {
642 my $ttleave = time + $minimum_time_per_loop;
643 my $file = $self->runstatusfile;
644 my $child = $self->_thaw_if_small_enough ($file);
645 if (!$options->{loop} && $child && $child->recentfiles->[-1]->uptodate) {
646 warn "DEBUG: parent process[$$] about to leave loop";
647 last LOOP;
649 my $pid;
650 if ($options->{loop}) {
651 $pid = fork;
652 } else {
653 $pid = 0;
655 if (! defined $pid) {
656 } elsif ($pid) {
657 waitpid($pid,0);
658 } else {
659 $self = $child || $self->thaw ($file);
660 my $rfs = $self->recentfiles;
661 $self->principal_recentfile->seed;
662 RECENTFILE: for my $i (0..$#$rfs) {
663 my $rf = $rfs->[$i];
664 if (time > $ttleave) {
665 # Must make sure that one file can get fetched in any case
666 $self->_max_one_state(1);
668 if ($rf->seeded) {
669 $self->_rmirror_mirror ($i, $options);
670 } elsif ($rf->uptodate) {
671 if ($i < $#$rfs) {
672 $rfs->[$i+1]->done->merge($rf->done);
674 # no further seed necessary because "periodic" does it
675 next RECENTFILE;
677 WORKUNIT: while (time < $ttleave) {
678 if ($rf->uptodate) {
679 $self->_rmirror_sleep_per_connection ($i);
680 next RECENTFILE;
681 } else {
682 $self->_rmirror_mirror ($i, $options);
685 if ($self->_max_one_state) {
686 last RECENTFILE;
689 $self->_max_one_state(0);
690 $self->_rmirror_runstatusfile_write ($file, $options);
691 if ($rfs->[-1]->uptodate) {
692 $self->_rmirror_cleanup;
693 my $file = $self->runstatusfile;
694 $self->_rmirror_runstatusfile_write ($file, $options);
695 unless ($options->{loop}) {
696 warn "DEBUG: uptodate child process[$$] about to leave loop";
697 sleep 1.5;
698 last LOOP;
700 } elsif ($options->{loop}) {
701 warn "DEBUG: child process[$$] about to leave loop";
702 sleep 1.5;
703 last LOOP;
707 my $sleep = $ttleave - time;
708 if ($sleep > 0.01) {
709 $self->_rmirror_endofloop_sleep ($sleep);
710 } else {
711 # negative time not invented yet:)
716 sub _rmirror_mirror {
717 my($self, $i, $options) = @_;
718 my $rfs = $self->recentfiles;
719 my $rf = $rfs->[$i];
720 my %locopt = %$options;
721 if ($self->_max_one_state) {
722 $locopt{max} = 1;
724 $locopt{piecemeal} = 1;
725 $rf->mirror (%locopt);
726 if ($i==0) {
727 # we limit to 0 for the case that upstream is broken and has
728 # more than one timestamp (happened on PAUSE 200903)
729 if (my $dirtymark = $rf->dirtymark) {
730 my $mydm = $self->_dirtymark;
731 if (!defined $mydm or $dirtymark ne $mydm) {
732 $self->_dirtymark($dirtymark);
733 $self->_fullseed;
739 sub _rmirror_sleep_per_connection {
740 my($self, $i) = @_;
741 my $rfs = $self->recentfiles;
742 my $rf = $rfs->[$i];
743 my $sleep = $rf->sleep_per_connection;
744 $sleep = 0.42 unless defined $sleep;
745 Time::HiRes::sleep $sleep;
746 $rfs->[$i+1]->done->merge($rf->done) if $i < $#$rfs;
749 sub _rmirror_cleanup {
750 my($self) = @_;
751 my $pathdb = $self->_pathdb();
752 for my $k (keys %$pathdb) {
753 delete $pathdb->{$k};
755 my $rfs = $self->recentfiles;
756 for my $i (0..$#$rfs-1) {
757 my $thismerged = $rfs->[$i]->merged;
758 my $next = $rfs->[$i+1];
759 my $nextminmax = $next->minmax;
760 # warn "DEBUG: i[$i] nextminmaxmax[$nextminmax->{max}] thismergedepoch[$thismerged->{epoch}]";
761 if (not defined $thismerged->{epoch} or _bigfloatlt($nextminmax->{max},$thismerged->{epoch})){
762 $next->seed;
763 # warn sprintf "DEBUG: next iv %s seeded since next-minmax-max[$nextminmax->{max}]lt this-merged-epoch[$thismerged->{epoch}]\n", $next->interval;
768 =head2 $file = $obj->runstatusfile ($set)
770 Getter/setter for C<_runstatusfile> attribute. Defaults to a temporary
771 file created by C<File::Temp>. A status file is required for
772 C<rmirror> working. Since it may be interesting for debugging
773 purposes, you may want to specify a permanent file for this.
775 =cut
776 sub runstatusfile {
777 my($self,$set) = @_;
778 if (defined $set) {
779 $self->_runstatusfile ($set);
781 my $x = $self->_runstatusfile;
782 unless (defined $x) {
783 require File::Temp;
784 my $tfile = File::Temp->new
786 TEMPLATE => "Recent-XXXX",
787 TMPDIR => 1,
788 UNLINK => 0,
789 CLEANUP => 0,
790 SUFFIX => '.dat',
792 $self->_runstatusfile($tfile->filename);
794 return $self->_runstatusfile;
797 # unused code.... it was an oops, discovered the thaw() method too
798 # late, and starting writing this here....
799 sub _rmirror_runstatusfile_read {
800 my($self, $file) = @_;
802 require YAML::Syck;
803 my $start = time;
804 # XXX is locking useful here?
805 while (not mkdir "$file.lock") {
806 Time::HiRes::sleep 0.2;
807 warn "*** waiting for lock ***" if time - $start >= 3;
809 my $yml = YAML::Syck::LoadFile $file;
810 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
811 my $rself = $yml->{reduced_self};
812 my $rfs = $yml->{reduced_rfs};
813 # XXX bring them into self
816 sub _rmirror_runstatusfile_write {
817 my($self, $file, $options) = @_;
818 my $rself;
819 while (my($k,$v) = each %$self) {
820 next if $k =~ /^-(_principal_recentfile|_recentfiles)$/;
821 $rself->{$k} = $v;
823 my $rfs = $self->recentfiles;
824 my $rrfs;
825 for my $i (0..$#$rfs) {
826 my $rf = $rfs->[$i];
827 while (my($k,$v) = each %$rf) {
828 next if $k =~ /^-(_current_tempfile_fh|_pathdb|_rsync)$/;
829 $rrfs->[$i]{$k} = $rfs->[$i]{$k};
832 require YAML::Syck;
833 my $start = time;
834 while (not mkdir "$file.lock") {
835 Time::HiRes::sleep 0.15;
836 warn "*** waiting for lock ***" if time - $start >= 3;
838 YAML::Syck::DumpFile
840 "$file.new",
842 options => $options,
843 time => time,
844 reduced_rfs => $rrfs,
845 reduced_self => $rself,
847 rename "$file.new", $file or die "Could not rename: $!";
848 rmdir "$file.lock" or die "Could not rmdir lockfile: $!";
851 sub _rmirror_endofloop_sleep {
852 my($self, $sleep) = @_;
853 if ($self->verbose) {
854 my $fh;
855 if (my $vl = $self->verboselog) {
856 open $fh, ">>", $vl or die "Could not open >> '$vl': $!";
857 } else {
858 $fh = \*STDERR;
860 printf $fh
862 "Dorm %d (%s secs)\n",
863 time,
864 $sleep,
866 sleep $sleep;
870 # it returns two things: abslfile and rfilename. But the abslfile is
871 # undef when the rfilename ends in .recent. A weird interface, my
872 # friend.
873 sub _principal_recentfile_fromremote_resosymlink {
874 my($self, $rfilename) = @_;
875 $rfilename = "RECENT.recent" unless length $rfilename;
876 my $abslfile = undef;
877 my $fh;
878 if ($rfilename =~ /\.recent$/) {
879 # may be a file *or* a symlink,
880 ($abslfile,$fh) = $self->_fetch_as_tempfile ($rfilename);
881 while (-l $abslfile) {
882 my $symlink = readlink $abslfile;
883 if ($symlink =~ m|/|) {
884 die "FIXME: filenames containing '/' not supported, got '$symlink'";
886 my $localrfile = File::Spec->catfile($self->localroot, $rfilename);
887 if (-e $localrfile) {
888 my $old_symlink = readlink $localrfile;
889 if ($old_symlink eq $symlink) {
890 unlink $abslfile or die "Cannot unlink '$abslfile': $!";
891 } else {
892 unlink $localrfile; # may fail
893 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
895 } else {
896 rename $abslfile, $localrfile or die "Cannot rename to '$localrfile': $!";
898 ($abslfile,$fh) = $self->_fetch_as_tempfile ($symlink);
901 return ($abslfile, $rfilename, $fh);
904 # takes a basename, returns an absolute name, does not delete the
905 # file, throws the $fh away. Caller must rename or unlink
907 # XXX needs to activate the fh in the rf0 so that it is able to unlink
908 # the file. I would like that the file is used immediately by $rf0
909 sub _fetch_as_tempfile {
910 my($self, $rfile) = @_;
911 my($suffix) = $rfile =~ /(\.[^\.]+)$/;
912 $suffix = "" unless defined $suffix;
913 my $fh = File::Temp->new
914 (TEMPLATE => sprintf(".FRMRecent-%s-XXXX",
915 $rfile,
917 DIR => $self->tempdir || $self->localroot,
918 SUFFIX => $suffix,
919 UNLINK => 0,
921 my $rsync;
922 unless ($rsync = File::Rsync->new($self->rsync_options)) {
923 require Carp;
924 Carp::confess(YAML::Syck::Dump($self->rsync_options));
926 my $dst = $fh->filename;
927 local($ENV{LANG}) = "C";
928 $rsync->exec
930 src => join("/",$self->remoteroot,$rfile),
931 dst => $dst,
932 ) or die "Could not mirror '$rfile' to $fh\: ".join(" ",$rsync->err);
933 unless (-l $dst) {
934 my $mode = 0644;
935 chmod $mode, $dst or die "Could not chmod $mode '$dst': $!";
937 return($dst,$fh);
940 =head2 $verbose = $obj->verbose ( $set )
942 Getter/setter method to set verbosity for this F:R:M:Recent object and
943 all associated Recentfile objects.
945 =cut
946 sub verbose {
947 my($self,$set) = @_;
948 if (defined $set) {
949 for ( @{$self->recentfiles} ) { $_->verbose($set) }
950 $self->_verbose ($set);
952 my $x = $self->_verbose;
953 unless (defined $x) {
954 $x = 0;
955 $self->_verbose ($x);
957 return $x;
961 =head2 my $vl = $obj->verboselog ( $set )
963 Getter/setter method for the path to the logfile to write verbose
964 progress information to.
966 Note: This is a primitive stop gap solution to get simple verbose
967 logging working. The program still sends error messages to STDERR.
968 Switching to Log4perl or similar is probably the way to go. TBD.
970 =cut
971 sub verboselog {
972 my($self,$set) = @_;
973 if (defined $set) {
974 for ( @{$self->recentfiles} ) { $_->verboselog($set) }
975 $self->_verboselog ($set);
977 my $x = $self->_verboselog;
978 unless (defined $x) {
979 $x = 0;
980 $self->_verboselog ($x);
982 return $x;
985 =head1 THE ARCHITECTURE OF A COLLECTION OF RECENTFILES
987 The idea is that we want to have a short file that records really
988 recent changes. So that a fresh mirror can be kept fresh as long as
989 the connectivity is given. Then we want longer files that record the
990 history before. So when the mirror falls behind the update period
991 reflected in the shortest file, it can complement the list of recent
992 file events with the next one. And if this is not long enough we want
993 another one, again a bit longer. And we want one that completes the
994 history back to the oldest file. The index files do contain the
995 complete list of current files. The longer a period covered by an
996 index file is gone the less often the index file is updated. For
997 practical reasons adjacent files will often overlap a bit but this is
998 neither necessary nor enforced. That's the basic idea. The following
999 example represents a tree that has a few updates every day:
1001 RECENT.recent -> RECENT-1h.yaml
1002 RECENT-1h.yaml
1003 RECENT-6h.yaml
1004 RECENT-1d.yaml
1005 RECENT-1M.yaml
1006 RECENT-1W.yaml
1007 RECENT-1Q.yaml
1008 RECENT-1Y.yaml
1009 RECENT-Z.yaml
1011 The first file is the principal file, in so far it is the one that is
1012 written first after a filesystem change. Usually a symlink links to it
1013 with a filename that has the same filenameroot and the suffix
1014 C<.recent>. On systems that do not support symlinks there is a plain
1015 copy maintained instead.
1017 The last file, the Z file, contains the complementary files that are
1018 in none of the other files. It may contain C<delete> events but often
1019 C<delete> events are discarded at the transition to the Z file.
1021 =head2 THE INDIVIDUAL RECENTFILE
1023 A I<recentfile> consists of a hash that has two keys: C<meta> and
1024 C<recent>. The C<meta> part has metadata and the C<recent> part has a
1025 list of fileobjects.
1027 =head2 THE META PART
1029 Here we find things that are pretty much self explaining: all
1030 lowercase attributes are accessors and as such explained in the
1031 manpages. The uppercase attribute C<Producers> contains version
1032 information about involved software components. Nothing to worry about
1033 as I believe.
1035 =head2 THE RECENT PART
1037 This is the interesting part. Every entry refers to some filesystem
1038 change (with path, epoch, type).
1040 The I<epoch> value is the point in time when some change was
1041 I<registered> but can be set to arbitrary values. Do not be tempted to
1042 believe that the entry has a direct relation to something like
1043 modification time or change time on the filesystem level. They are not
1044 reflecting release dates. (If you want exact release dates: Barbie is
1045 providing a database of them. See
1046 http://use.perl.org/~barbie/journal/37907).
1048 All these entries can be devided into two types (denoted by the
1049 I<type> attribute): C<new>s and C<delete>s. Changes and creations are
1050 C<new>s. Deletes are C<delete>s.
1052 Besides an I<epoch> and a I<type> attribute we find a third one:
1053 I<path>. This path is relative to the directory we find the
1054 I<recentfile> in.
1056 The order of the entries in the I<recentfile> is by decreasing epoch
1057 attribute. These are unique floating point numbers. When the server
1058 has ntp running correctly, then the timestamps are usually reflecting
1059 a real epoch. If time is running backwards, we trump the system epoch
1060 with strictly monotonically increasing floating point timestamps and
1061 guarantee they are unique.
1063 =head1 CORRUPTION AND RECOVERY
1065 If the origin host breaks the promise to deliver consistent and
1066 complete I<recentfiles> then it must update its C<dirtymark> and all
1067 slaves must discard what they cosider the truth. In the worst case
1068 that something goes wrong despite the dirtymark mechanism the way back
1069 to sanity can always be achieved through traditional rsyncing between
1070 the hosts.
1072 =head1 BACKGROUND
1074 This is about speeding up rsync operation on large trees. Uses a small
1075 metadata cocktail and pull technology.
1077 rersyncrecent solves this problem with a couple of (usually 2-10)
1078 lightweight index files which cover different overlapping time
1079 intervals. The master writes these files and the clients/slaves can
1080 construct the full tree from the information contained in them. The
1081 most recent index file usually covers the last seconds or minutes or
1082 hours of the tree and depending on the needs, slaves can rsync every
1083 few seconds or minutes and then bring their trees in full sync.
1085 The rersyncrecent model was developed for CPAN but as it is both
1086 convenient and economic it is also a general purpose solution. I'm
1087 looking forward to see a CPAN backbone that is only a few seconds
1088 behind PAUSE.
1090 =head2 NON-COMPETITORS
1092 File::Mirror JWU/File-Mirror/File-Mirror-0.10.tar.gz only local trees
1093 Mirror::YAML ADAMK/Mirror-YAML-0.03.tar.gz some sort of inner circle
1094 Net::DownloadMirror KNORR/Net-DownloadMirror-0.04.tar.gz FTP sites and stuff
1095 Net::MirrorDir KNORR/Net-MirrorDir-0.05.tar.gz dito
1096 Net::UploadMirror KNORR/Net-UploadMirror-0.06.tar.gz dito
1097 Pushmi::Mirror CLKAO/Pushmi-v1.0.0.tar.gz something SVK
1099 rsnapshot www.rsnapshot.org focus on backup
1100 csync www.csync.org more like unison
1101 multi-rsync sourceforge 167893 lan push to many
1102 chasm chasmd.org per-directory manifests
1104 =head2 COMPETITORS
1106 The problem to solve which clusters and ftp mirrors and otherwise
1107 replicated datasets like CPAN share: how to transfer only a minimum
1108 amount of data to determine the diff between two hosts.
1110 Normally it takes a long time to determine the diff itself before it
1111 can be transferred. Known solutions at the time of this writing are
1112 csync2, and rsync 3 batch mode.
1114 For many years the best solution was B<csync2> which solves the
1115 problem by maintaining a sqlite database on both ends and talking a
1116 highly sophisticated protocol to quickly determine which files to send
1117 and which to delete at any given point in time. Csync2 is often
1118 inconvenient because it is push technology and the act of syncing
1119 demands quite an intimate relationship between the sender and the
1120 receiver. This is hard to achieve in an environment of loosely coupled
1121 sites where the number of sites is large or connections are unreliable
1122 or network topology is changing.
1124 B<Rsync 3 batch mode> works around these problems by providing
1125 rsync-able batch files which allow receiving nodes to replay the
1126 history of the other nodes. This reduces the need to have an
1127 incestuous relation but it has the disadvantage that these batch files
1128 replicate the contents of the involved files. This seems inappropriate
1129 when the nodes already have a means of communicating over rsync.
1131 =head2 HONORABLE MENTION
1133 B<instantmirror> at https://fedorahosted.org/InstantMirror/ is an
1134 ambitious project that tries to combine various technologies (squid,
1135 bittorrent) to overcome the current slowness with the main focus on
1136 fedora. It's been founded in 2009-03 and at the time of this writing
1137 it is still a bit early to comment on.
1139 =head1 LIMITATIONS
1141 If the tree of the master server is changing faster than the bandwidth
1142 permits to mirror then additional protocols may need to be deployed.
1143 Certainly p2p/bittorrent can help in such situations because
1144 downloading sites help each other and bittorrent chunks large files
1145 into pieces.
1147 =head1 FUTURE DIRECTIONS
1149 Currently the origin server must keep track of injected and removed
1150 files. Should be supported by an inotify-based assistant.
1152 Convince other users outside the CPAN like
1153 http://fedoraproject.org/wiki/Infrastructure/Mirroring
1155 =head1 SEE ALSO
1157 L<File::Rsync::Mirror::Recentfile>,
1158 L<File::Rsync::Mirror::Recentfile::Done>,
1159 L<File::Rsync::Mirror::Recentfile::FakeBigFloat>
1161 =head1 BUGS
1163 Please report any bugs or feature requests through the web interface
1165 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=File-Rsync-Mirror-Recent>.
1166 I will be notified, and then you'll automatically be notified of
1167 progress on your bug as I make changes.
1169 =head1 SUPPORT
1171 You can find documentation for this module with the perldoc command.
1173 perldoc File::Rsync::Mirror::Recent
1175 You can also look for information at:
1177 =over 4
1179 =item * RT: CPAN's request tracker
1181 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=File-Rsync-Mirror-Recent>
1183 =item * AnnoCPAN: Annotated CPAN documentation
1185 L<http://annocpan.org/dist/File-Rsync-Mirror-Recent>
1187 =item * CPAN Ratings
1189 L<http://cpanratings.perl.org/d/File-Rsync-Mirror-Recent>
1191 =item * Search CPAN
1193 L<http://search.cpan.org/dist/File-Rsync-Mirror-Recent>
1195 =back
1198 =head1 ACKNOWLEDGEMENTS
1200 Thanks to RJBS for module-starter.
1202 =head1 AUTHOR
1204 Andreas König
1206 =head1 COPYRIGHT & LICENSE
1208 Copyright 2008, 2009 Andreas König.
1210 This program is free software; you can redistribute it and/or modify it
1211 under the same terms as Perl itself.
1214 =cut
1216 1; # End of File::Rsync::Mirror::Recent