->done and piecemeal starts working
[rersyncrecent.git] / lib / File / Rsync / Mirror / Recentfile.pm
blobd5007ded258e87913f466dc2e818504c31e89ab5
1 package File::Rsync::Mirror::Recentfile;
3 # use warnings;
4 use strict;
6 =encoding utf-8
8 =head1 NAME
10 File::Rsync::Mirror::Recentfile - mirroring via rsync made efficient
12 =head1 VERSION
14 Version 0.0.1
16 =cut
18 my $HAVE = {};
19 for my $package (
20 "Data::Serializer",
21 "File::Rsync"
22 ) {
23 $HAVE->{$package} = eval qq{ require $package; };
25 use Config;
26 use File::Basename qw(dirname fileparse);
27 use File::Copy qw(cp);
28 use File::Path qw(mkpath);
29 use File::Temp;
30 use List::Util qw(first min);
31 use Scalar::Util qw(reftype);
32 use Storable;
33 use Time::HiRes qw();
34 use YAML::Syck;
36 use version; our $VERSION = qv('0.0.1');
39 use constant MAX_INT => ~0>>1; # anything better?
40 use constant DEFAULT_PROTOCOL => 1;
42 # cf. interval_secs
43 my %seconds;
45 # maybe subclass if this mapping is bad?
46 my %serializers;
48 =head1 SYNOPSIS
50 B<!!!! PRE-ALPHA ALERT !!!!>
52 Nothing in here is believed to be stable, nothing yet intended for
53 public consumption. The plan is to provide a script in one of the next
54 releases that acts as a frontend for all the backend functionality.
55 Option and method names will very likely change.
57 For the rationale see the section BACKGROUND.
59 This is published only for developers of the (yet to be named)
60 script(s).
62 Writer (of a single file):
64 use File::Rsync::Mirror::Recentfile;
65 my $fr = File::Rsync::Mirror::Recentfile->new
67 interval => q(6h),
68 filenameroot => "RECENT",
69 comment => "These 'RECENT' files are part of a test of a new CPAN mirroring concept. Please ignore them for now.",
70 localroot => "/home/ftp/pub/PAUSE/authors/",
71 aggregator => [qw(1d 1W 1M 1Q 1Y Z)],
73 $rf->update("/home/ftp/pub/PAUSE/authors/id/A/AN/ANDK/CPAN-1.92_63.tar.gz","new");
75 Reader/mirrorer:
77 my $rf = File::Rsync::Mirror::Recentfile->new
79 filenameroot => "RECENT",
80 ignore_link_stat_errors => 1,
81 interval => q(6h),
82 localroot => "/home/ftp/pub/PAUSE/authors",
83 remote_dir => "",
84 remote_host => "pause.perl.org",
85 remote_module => "authors",
86 rsync_options => {
87 compress => 1,
88 'rsync-path' => '/usr/bin/rsync',
89 links => 1,
90 times => 1,
91 'omit-dir-times' => 1,
92 checksum => 1,
94 verbose => 1,
96 $rf->mirror;
98 Aggregator (usually the writer):
100 my $rf = File::Rsync::Mirror::Recentfile->new_from_file ( $file );
101 $rf->aggregate;
103 =head1 EXPORT
105 No exports.
107 =head1 CONSTRUCTORS
109 =head2 my $obj = CLASS->new(%hash)
111 Constructor. On every argument pair the key is a method name and the
112 value is an argument to that method name.
114 If a recentfile for this resource already exists, metadata that are
115 not defined by the constructor will be fetched from there as soon as
116 it is being read by recent_events().
118 =cut
120 sub new {
121 my($class, @args) = @_;
122 my $self = bless {}, $class;
123 while (@args) {
124 my($method,$arg) = splice @args, 0, 2;
125 $self->$method($arg);
127 unless (defined $self->protocol) {
128 $self->protocol(DEFAULT_PROTOCOL);
130 unless (defined $self->filenameroot) {
131 $self->filenameroot("RECENT");
133 unless (defined $self->serializer_suffix) {
134 $self->serializer_suffix(".yaml");
136 return $self;
139 =head2 my $obj = CLASS->new_from_file($file)
141 Constructor. $file is a I<recentfile>.
143 =cut
145 sub new_from_file {
146 my($class, $file) = @_;
147 my $self = bless {}, $class;
148 $self->_rfile($file);
149 #?# $self->lock;
150 my $serialized = do { open my $fh, $file or die "Could not open '$file': $!";
151 local $/;
152 <$fh>;
154 # XXX: we can skip this step when the metadata are sufficient, but
155 # we cannot parse the file without some magic stuff about
156 # serialized formats
157 while (-l $file) {
158 my($name,$path) = fileparse $file;
159 my $symlink = readlink $file;
160 if ($symlink =~ m|/|) {
161 die "FIXME: filenames containing '/' not supported, got $symlink";
163 $file = File::Spec->catfile ( $path, $symlink );
165 my($name,$path,$suffix) = fileparse $file, keys %serializers;
166 $self->serializer_suffix($suffix);
167 $self->localroot($path);
168 die "Could not determine file format from suffix" unless $suffix;
169 my $deserialized;
170 if ($suffix eq ".yaml") {
171 require YAML::Syck;
172 $deserialized = YAML::Syck::LoadFile($file);
173 } elsif ($HAVE->{"Data::Serializer"}) {
174 my $serializer = Data::Serializer->new
175 ( serializer => $serializers{$suffix} );
176 $deserialized = $serializer->raw_deserialize($serialized);
177 } else {
178 die "Data::Serializer not installed, cannot proceed with suffix '$suffix'";
180 while (my($k,$v) = each %{$deserialized->{meta}}) {
181 next if $k ne lc $k; # "Producers"
182 $self->$k($v);
184 unless (defined $self->protocol) {
185 $self->protocol(DEFAULT_PROTOCOL);
187 return $self;
190 =head1 ACCESSORS
192 =cut
194 my @accessors;
196 BEGIN {
197 @accessors = (
198 "_current_tempfile",
199 "_current_tempfile_fh",
200 "_interval",
201 "_is_locked",
202 "_localroot",
203 "_remote_dir",
204 "_remoteroot",
205 "_rfile",
206 "_rsync",
207 "_use_tempfile",
210 my @pod_lines =
211 split /\n/, <<'=cut'; push @accessors, grep {s/^=item\s+//} @pod_lines; }
213 =over 4
215 =item aggregator
217 A list of interval specs that tell the aggregator which I<recentfile>s
218 are to be produced.
220 =item canonize
222 The name of a method to canonize the path before rsyncing. Only
223 supported value is C<naive_path_normalize>. Defaults to that.
225 =item comment
227 A comment about this tree and setup.
229 =item done
231 A reference to a File::Rsync::Mirror::Recentfile::Done object that
232 keeps track of rsync activities. Only used/needed when we are a
233 mirroring slave.
235 =item filenameroot
237 The (prefix of the) filename we use for this I<recentfile>. Defaults to
238 C<RECENT>.
240 =item have_mirrored
242 Timestamp remembering when we mirrored this recentfile the last time.
243 Only relevant for slaves.
245 =item ignore_link_stat_errors
247 If set to true, rsync errors are ignored that complain about link stat
248 errors. These seem to happen only when there are files missing at the
249 origin. In race conditions this can always happen, so it is
250 recommended to set this value to true.
252 =item is_slave
254 If set to true, this object will fetch a new recentfile from remote
255 when the timespan between the last mirror (see have_mirrored) and now
256 is too large (currently hardcoded arbitrary 420 seconds).
258 =item locktimeout
260 After how many seconds shall we die if we cannot lock a I<recentfile>?
261 Defaults to 600 seconds.
263 =item loopinterval
265 When mirror_loop is called, this accessor can specify how much time
266 every loop shall at least take. If the work of a loop is done before
267 that time has gone, sleeps for the rest of the time. Defaults to
268 arbitrary 42 seconds.
270 =item max_files_per_connection
272 Maximum number of files that are transferred on a single rsync call.
273 Setting it higher means higher performance at the price of holding
274 connections longer and potentially disturbing other users in the pool.
275 Defaults to the arbitrary value 42.
277 =item max_rsync_errors
279 When rsync operations encounter that many errors without any resetting
280 success in between, then we die. Defaults to -1 which means we run
281 forever ignoring all rsync errors.
283 =item merged
285 Hashref denoting when this recentfile has been merged into some other
286 at which epoch.
288 =item minmax
290 Hashref remembering when we read the recent_events from this file the
291 last time and what the timespan was.
293 =item protocol
295 When the RECENT file format changes, we increment the protocol. We try
296 to support older protocols in later releases.
298 =item remote_host
300 The host we are mirroring from. Leave empty for the local filesystem.
302 =item remote_module
304 Rsync servers have so called modules to separate directory trees from
305 each other. Put here the name of the module under which we are
306 mirroring. Leave empty for local filesystem.
308 =item rsync_options
310 Things like compress, links, times or checksums. Passed in to the
311 File::Rsync object used to run the mirror.
313 =item serializer_suffix
315 Mostly untested accessor. The only well tested format for
316 I<recentfile>s at the moment is YAML. It is used with YAML::Syck via
317 Data::Serializer. But in principle other formats are supported as
318 well. See section SERIALIZERS below.
320 =item sleep_per_connection
322 Sleep that many seconds (floating point OK) after every chunk of rsyncing
323 has finished. Defaults to arbitrary 0.42.
325 =item ttl
327 Time to live. Number of seconds after which this recentfile must be
328 fetched again from the origin server. Only relevant for slaves.
329 Defaults to arbitrary 24.2 seconds.
331 =item verbose
333 Boolean to turn on a bit verbosity.
335 =back
337 =cut
339 use accessors @accessors;
341 =head1 METHODS
343 =head2 (void) $obj->aggregate
345 Takes all intervals that are collected in the accessor called
346 aggregator. Sorts them by actual length of the interval.
347 Removes those that are shorter than our own interval. Then merges this
348 object into the next larger object. The merging continues upwards
349 as long as the next I<recentfile> is old enough to warrant a merge.
351 If a merge is warranted is decided according to the interval of the
352 previous interval so that larger files are not so often updated as
353 smaller ones.
355 Here is an example to illustrate the behaviour. Given aggregators
357 1h 1d 1W 1M 1Q 1Y Z
359 then
361 1h updates 1d on every call to aggregate()
362 1d updates 1W earliest after 1h
363 1W updates 1M earliest after 1d
364 1M updates 1Q earliest after 1W
365 1Q updates 1Y earliest after 1M
366 1Y updates Z earliest after 1Q
368 Note that all but the smallest recentfile get updated at an arbitrary
369 rate and as such are quite useless on their own.
371 =cut
373 sub aggregate {
374 my($self) = @_;
375 my @aggs = sort { $a->{secs} <=> $b->{secs} }
376 grep { $_->{secs} >= $self->interval_secs }
377 map { { interval => $_, secs => $self->interval_secs($_)} }
378 $self->interval, @{$self->aggregator || []};
379 $aggs[0]{object} = $self;
380 AGGREGATOR: for my $i (0..$#aggs-1) {
381 my $this = $aggs[$i]{object};
382 my $next = Storable::dclone $this;
383 $next->interval($aggs[$i+1]{interval});
384 my $want_merge = 0;
385 if ($i == 0) {
386 $want_merge = 1;
387 } else {
388 my $next_rfile = $next->rfile;
389 if (-e $next_rfile) {
390 my $prev = $aggs[$i-1]{object};
391 local $^T = time;
392 my $next_age = 86400 * -M $next_rfile;
393 if ($next_age > $prev->interval_secs) {
394 $want_merge = 1;
396 } else {
397 $want_merge = 1;
400 if ($want_merge) {
401 $next->merge($this);
402 $aggs[$i+1]{object} = $next;
403 } else {
404 last AGGREGATOR;
409 sub _debug_aggregate {
410 my($self) = @_;
411 my @aggs = sort { $a->{secs} <=> $b->{secs} }
412 map { { interval => $_, secs => $self->interval_secs($_)} }
413 $self->interval, @{$self->aggregator || []};
414 my $report = [];
415 for my $i (0..$#aggs) {
416 my $this = Storable::dclone $self;
417 $this->interval($aggs[$i]{interval});
418 my $rfile = $this->rfile;
419 my @stat = stat $rfile;
420 push @$report, [$rfile, map {$stat[$_]||"undef"} 7,9];
422 $report;
425 # (void) $self->_assert_symlink()
426 sub _assert_symlink {
427 my($self) = @_;
428 my $recentrecentfile = File::Spec->catfile
430 $self->localroot,
431 sprintf
433 "%s.recent",
434 $self->filenameroot
437 if ($Config{d_symlink} eq "define") {
438 my $howto_create_symlink; # 0=no need; 1=straight symlink; 2=rename symlink
439 if (-l $recentrecentfile) {
440 my $found_symlink = readlink $recentrecentfile;
441 if ($found_symlink eq $self->rfilename) {
442 return;
443 } else {
444 $howto_create_symlink = 2;
446 } else {
447 $howto_create_symlink = 1;
449 if (1 == $howto_create_symlink) {
450 symlink $self->rfilename, $recentrecentfile or die "Could not create symlink '$recentrecentfile': $!"
451 } else {
452 unlink "$recentrecentfile.$$"; # may fail
453 symlink $self->rfilename, "$recentrecentfile.$$" or die "Could not create symlink '$recentrecentfile.$$': $!";
454 rename "$recentrecentfile.$$", $recentrecentfile or die "Could not rename '$recentrecentfile.$$' to $recentrecentfile: $!";
456 } else {
457 warn "Warning: symlinks not supported on this system, doing a copy instead\n";
458 unlink "$recentrecentfile.$$"; # may fail
459 cp $self->rfilename, "$recentrecentfile.$$" or die "Could not copy to '$recentrecentfile.$$': $!";
460 rename "$recentrecentfile.$$", $recentrecentfile or die "Could not rename '$recentrecentfile.$$' to $recentrecentfile: $!";
464 =head2 $success = $obj->full_mirror
466 (TBD) Mirrors the whole remote site, starting with the smallest I<recentfile>,
467 switching to larger ones ...
469 =cut
471 sub full_mirror {
472 my($self) = @_;
473 die "FIXME: Not yet implemented";
476 =head2 $tempfilename = $obj->get_remote_recentfile_as_tempfile ($rfilename)
478 =head2 $tempfilename = $obj->get_remote_recentfile_as_tempfile ()
480 Stores the remote I<recentfile> locally as a tempfile. $rfilename must
481 be a plain filename without path separators. The second form fetches
482 the file with the default name. The caller is responsible to remove
483 the file after use.
485 Note: if you're intending to act as an rsync server for other slaves,
486 then you must prefer this method to mirror (and read) recentfiles over
487 get_remotefile(). Otherwise downstream mirrors would expect you to
488 have files that you do not have yet.
490 =cut
492 sub get_remote_recentfile_as_tempfile {
493 my($self, $rfilename) = @_;
494 mkpath $self->localroot;
495 my $fh;
496 if ($rfilename) {
497 $self->_use_tempfile (1);
498 } elsif ( $self->_use_tempfile() ) {
499 return $self->_current_tempfile if ! $self->ttl_reached;
500 $fh = $self->_current_tempfile_fh;
501 $rfilename = $self->rfilename;
502 } else {
503 $rfilename = $self->rfilename;
505 die "Alert: illegal filename[$rfilename] contains a slash" if $rfilename =~ m|/|;
506 my $dst;
507 if ($fh) {
508 $dst = $self->_current_tempfile;
509 } else {
510 $fh = File::Temp->new
511 (TEMPLATE => sprintf(".%s-XXXX",
512 $rfilename,
514 DIR => $self->localroot,
515 SUFFIX => $self->serializer_suffix,
516 UNLINK => $self->_use_tempfile,
518 if ($self->_use_tempfile) {
519 $self->_current_tempfile_fh ($fh); # delay self destruction
521 $dst = $fh->filename;
522 $self->_current_tempfile ($dst);
523 my $rfile = eval { $self->rfile; }; # may fail (RECENT.recent has no rfile)
524 if (defined $rfile && -e $rfile) {
525 # saving on bandwidth. Might need to be configurable
526 # $self->bandwidth_is_cheap?
527 cp $rfile, $dst or die "Could not copy '$rfile' to '$dst': $!"
530 my $src = join ("/",
531 $self->remoteroot,
532 $rfilename,
534 if ($self->verbose) {
535 my $doing = -e $dst ? "Syncing" : "Getting";
536 printf STDERR
538 "%s (1/1) temporary %s ... ",
539 $doing,
540 $rfilename,
543 while (!$self->rsync->exec(
544 src => $src,
545 dst => $dst,
546 )) {
547 $self->register_rsync_error ($self->rsync->err);
549 $self->have_mirrored (Time::HiRes::time);
550 $self->un_register_rsync_error ();
551 if ($self->verbose) {
552 print STDERR "DONE\n";
554 my $mode = 0644;
555 chmod $mode, $dst or die "Could not chmod $mode '$dst': $!";
556 return $dst;
559 =head2 $localpath = $obj->get_remotefile ( $relative_path )
561 Rsyncs one single remote file to local filesystem.
563 Note: no locking is done on this file. Any number of processes may
564 mirror this object.
566 Note II: do not use for recentfiles. If you are a cascading
567 slave/server combination, it would confuse other slaves. They would
568 expect the contents of these recentfiles to be available. Use
569 get_remote_recentfile_as_tempfile() instead.
571 =cut
573 sub get_remotefile {
574 my($self, $path) = @_;
575 my $dst = File::Spec->catfile($self->localroot, $path);
576 mkpath dirname $dst;
577 if ($self->verbose) {
578 my $doing = -e $dst ? "Syncing" : "Getting";
579 printf STDERR
581 "%s (1/1) %s ... ",
582 $doing,
583 $path,
586 while (!$self->rsync->exec(
587 src => join("/",
588 $self->remoteroot,
589 $path),
590 dst => $dst,
591 )) {
592 $self->register_rsync_error ($self->rsync->err);
594 $self->un_register_rsync_error ();
595 if ($self->verbose) {
596 print STDERR "DONE\n";
598 return $dst;
601 =head2 $obj->interval ( $interval_spec )
603 Get/set accessor. $interval_spec is a string and described below in
604 the section INTERVAL SPEC.
606 =cut
608 sub interval {
609 my ($self, $interval) = @_;
610 if (@_ >= 2) {
611 $self->_interval($interval);
612 $self->_rfile(undef);
614 $interval = $self->_interval;
615 unless (defined $interval) {
616 # do not ask the $self too much, it recurses!
617 require Carp;
618 Carp::confess("Alert: interval undefined for '".$self."'. Cannot continue.");
620 return $interval;
623 =head2 $secs = $obj->interval_secs ( $interval_spec )
625 $interval_spec is described below in the section INTERVAL SPEC. If
626 empty defaults to the inherent interval for this object.
628 =cut
630 sub interval_secs {
631 my ($self, $interval) = @_;
632 $interval ||= $self->interval;
633 unless (defined $interval) {
634 die "interval_secs() called without argument on an object without a declared one";
636 my ($n,$t) = $interval =~ /^(\d*)([smhdWMQYZ]$)/ or
637 die "Could not determine seconds from interval[$interval]";
638 if ($interval eq "Z") {
639 return MAX_INT;
640 } elsif (exists $seconds{$t} and $n =~ /^\d+$/) {
641 return $seconds{$t}*$n;
642 } else {
643 die "Invalid interval specification: n[$n]t[$t]";
647 =head2 $obj->localroot ( $localroot )
649 Get/set accessor. The local root of the tree.
651 =cut
653 sub localroot {
654 my ($self, $localroot) = @_;
655 if (@_ >= 2) {
656 $self->_localroot($localroot);
657 $self->_rfile(undef);
659 $localroot = $self->_localroot;
662 =head2 $ret = $obj->local_path($path_found_in_recentfile)
664 Combines the path to our local mirror and the path of an object found
665 in this I<recentfile>. In other words: the target of a mirror operation.
667 Implementation note: We split on slashes and then use
668 File::Spec::catfile to adjust to the local operating system.
670 =cut
672 sub local_path {
673 my($self,$path) = @_;
674 unless (defined $path) {
675 # seems like a degenerated case
676 return $self->localroot;
678 my @p = split m|/|, $path;
679 File::Spec->catfile($self->localroot,@p);
682 =head2 (void) $obj->lock
684 Locking is implemented with an C<mkdir> on a locking directory
685 (C<.lock> appended to $rfile).
687 =cut
689 sub lock {
690 my ($self) = @_;
691 # not using flock because it locks on filehandles instead of
692 # old school ressources.
693 my $locked = $self->_is_locked and return;
694 my $rfile = $self->rfile;
695 # XXX need a way to allow breaking the lock
696 my $start = time;
697 my $locktimeout = $self->locktimeout || 600;
698 while (not mkdir "$rfile.lock") {
699 Time::HiRes::sleep 0.01;
700 if (time - $start > $locktimeout) {
701 die "Could not acquire lockdirectory '$rfile.lock': $!";
704 $self->_is_locked (1);
707 =head2 $ret = $obj->merge ($other)
709 Bulk update of this object with another one. It's intended (but not
710 enforced) to only merge smaller and younger $other objects into the
711 current one. If this file is a C<Z> file, then we do not merge in
712 objects of type C<delete>. But if we encounter an object of type
713 delete we delete the corresponding C<new> object.
715 =cut
717 sub merge {
718 my($self,$other) = @_;
719 $other->lock;
720 my $other_recent = $other->recent_events || [];
721 $self->lock;
722 my $my_recent = $self->recent_events || [];
724 # calculate the target time span
725 my $epoch = $other_recent->[0] ? $other_recent->[0]{epoch} : $my_recent->[0] ? $my_recent->[0]{epoch} : undef;
726 my $oldest_allowed = 0;
727 if ($epoch) {
728 if (my $merged = $self->merged) {
729 my $secs = $self->interval_secs();
730 $oldest_allowed = min($epoch - $secs, $merged->{epoch});
732 # throw away outsiders
733 while (@$my_recent && $my_recent->[-1]{epoch} < $oldest_allowed) {
734 pop @$my_recent;
738 my %have;
739 my $recent = [];
740 for my $ev (@$other_recent) {
741 my $epoch = $ev->{epoch} || 0;
742 next if $epoch < $oldest_allowed;
743 my $path = $ev->{path};
744 next if $have{$path}++;
745 if ( $self->interval eq "Z"
746 and $ev->{type} eq "delete") {
747 # do nothing
748 } else {
749 push @$recent, { epoch => $ev->{epoch}, path => $path, type => $ev->{type} };
752 push @$recent, grep { !$have{$_->{path}}++ } @$my_recent;
753 $self->write_recent($recent);
754 $self->unlock;
755 $other->merged({
756 time => Time::HiRes::time, # not used anywhere
757 epoch => $epoch, # used in oldest_allowed
758 into_interval => $self->interval, # not used anywhere
760 $other->write_recent($other_recent);
761 $other->unlock;
764 =head2 $hashref = $obj->meta_data
766 Returns the hashref of metadata that the server has to add to the
767 I<recentfile>.
769 =cut
771 sub meta_data {
772 my($self) = @_;
773 my $ret = $self->{meta};
774 for my $m (
775 "aggregator",
776 "canonize",
777 "comment",
778 "filenameroot",
779 "merged",
780 "interval",
781 "protocol",
782 "serializer_suffix",
784 my $v = $self->$m;
785 if (defined $v) {
786 $ret->{$m} = $v;
789 # XXX need to reset the Producer if I am a writer, keep it when I
790 # am a reader
791 $ret->{Producers} ||= {
792 __PACKAGE__, "$VERSION", # stringified it looks better
793 '$0', $0,
794 'time', Time::HiRes::time,
796 return $ret;
799 =head2 $success = $obj->mirror ( %options )
801 Mirrors the files in this I<recentfile> as reported by
802 C<recent_events>. Options named C<after>, C<before>, C<max>, and
803 C<skip-deletes> are passed through to the L<recent_events> call. The
804 boolean option C<piecemeal>, if true, causes C<mirror> to only rsync
805 C<max_files_per_connection> and keep track of the rsynced files so
806 that future calls will rsync different files until all files are
807 brought to sync.
809 =cut
811 sub mirror {
812 my($self, %options) = @_;
813 my $trecentfile = $self->get_remote_recentfile_as_tempfile();
814 $self->_use_tempfile (1);
815 my %passthrough = map { ($_ => $options{$_}) } qw(before after max skip-deletes);
816 my ($recent_events) = $self->recent_events(%passthrough);
817 my(@error, @collector, @icollector);
818 my $first_item = 0;
819 my $last_item = $#$recent_events;
820 my $done = $self->done;
821 if (!$done) {
822 require File::Rsync::Mirror::Recentfile::Done;
823 $done = File::Rsync::Mirror::Recentfile::Done->new();
824 $self->done ( $done );
826 ITEM: for my $i ($first_item..$last_item) {
827 my $recent_event = $recent_events->[$i];
828 next if $done->covered ( $recent_event->{epoch} );
829 my $dst = $self->local_path($recent_event->{path});
830 if ($recent_event->{type} eq "new"){
831 if ($self->verbose) {
832 my $doing = -e $dst ? "Syncing" : "Getting";
833 printf STDERR
835 "%s (%d/%d) %s ... ",
836 $doing,
837 1+$i,
838 1+$last_item,
839 $recent_event->{path},
842 my $max_files_per_connection = $self->max_files_per_connection || 42;
843 my $success;
844 if ($self->verbose) {
845 print STDERR "\n";
847 push @collector, $recent_event->{path};
848 push @icollector, $i;
849 if (@collector == $max_files_per_connection) {
850 $success = eval { $self->mirror_path(\@collector) };
851 @collector = ();
852 $done->register($recent_events, \@icollector);
853 @icollector = ();
854 my $sleep = $self->sleep_per_connection;
855 $sleep = 0.42 unless defined $sleep;
856 Time::HiRes::sleep $sleep;
857 } else {
858 next ITEM;
860 if (!$success || $@) {
861 warn "Warning: Error while mirroring: $@";
862 push @error, $@;
863 sleep 1;
865 if ($self->verbose) {
866 print STDERR "DONE\n";
868 } elsif ($recent_event->{type} eq "delete") {
869 if ($options{'skip-deletes'}) {
870 } else {
871 if (-l $dst or not -d _) {
872 unless (unlink $dst) {
873 require Carp;
874 Carp::cluck ( "Warning: Error while unlinking '$dst': $!" );
876 } else {
877 unless (rmdir $dst) {
878 require Carp;
879 Carp::cluck ( "Warning: Error on rmdir '$dst': $!" );
883 $done->register($recent_events, [$i]);
884 } else {
885 warn "Warning: invalid upload type '$recent_event->{type}'";
888 if (@collector) {
889 my $success = eval { $self->mirror_path(\@collector) };
890 @collector = ();
891 $done->register($recent_events, \@icollector);
892 @icollector = ();
893 if (!$success || $@) {
894 warn "Warning: Unknown error while mirroring: $@";
895 push @error, $@;
896 sleep 1;
898 if ($self->verbose) {
899 print STDERR "DONE\n";
902 my $rfile = $self->rfile;
903 unless (rename $trecentfile, $rfile) {
904 require Carp;
905 Carp::confess("Could not rename '$trecentfile' to '$rfile': $!");
907 $self->_use_tempfile (0);
908 if (my $ctfh = $self->_current_tempfile_fh) {
909 $ctfh->unlink_on_destroy (0);
910 $self->_current_tempfile_fh (undef);
912 return !@error;
915 =head2 (void) $obj->mirror_loop
917 Run mirror in an endless loop. See the accessor C<loopinterval>. XXX
918 What happens/should happen if we miss the interval during a single loop?
920 =cut
922 sub mirror_loop {
923 my($self) = @_;
924 my $iteration_start = time;
926 my $Signal = 0;
927 $SIG{INT} = sub { $Signal++ };
928 my $loopinterval = $self->loopinterval || 42;
929 my $after = -999999999;
930 LOOP: while () {
931 $self->mirror($after);
932 last LOOP if $Signal;
933 my $re = $self->recent_events;
934 $after = $re->[0]{epoch};
935 if ($self->verbose) {
936 local $| = 1;
937 print "($after)";
939 if (time - $iteration_start < $loopinterval) {
940 sleep $iteration_start + $loopinterval - time;
942 if ($self->verbose) {
943 local $| = 1;
944 print "~";
949 =head2 $success = $obj->mirror_path ( $arrref | $path )
951 If the argument is a scalar it is treated as a path. The remote path
952 is mirrored into the local copy. $path is the path found in the
953 I<recentfile>, i.e. it is relative to the root directory of the
954 mirror.
956 If the argument is an array reference then all elements are treated as
957 a path below the current tree and all are rsynced with a single
958 command (and a single connection).
960 =cut
962 sub mirror_path {
963 my($self,$path) = @_;
964 # XXX simplify the two branches such that $path is treated as
965 # [$path] maybe even demand the argument as an arrayref to
966 # simplify docs and code. (rsync-over-recentfile-2.pl uses the
967 # interface)
968 if (ref $path and ref $path eq "ARRAY") {
969 my $dst = $self->localroot;
970 mkpath dirname $dst;
971 my($fh) = File::Temp->new(TEMPLATE => sprintf(".%s-XXXX",
972 lc $self->filenameroot,
974 TMPDIR => 1,
975 UNLINK => 0,
977 for my $p (@$path) {
978 print $fh $p, "\n";
980 $fh->flush;
981 $fh->unlink_on_destroy(1);
982 while (!$self->rsync->exec
984 src => join("/",
985 $self->remoteroot,
987 dst => $dst,
988 'files-from' => $fh->filename,
989 )) {
990 my($err) = $self->rsync->err;
991 if ($self->ignore_link_stat_errors && $err =~ m{^ rsync: \s link_stat }x ) {
992 if ($self->verbose) {
993 warn "Info: ignoring link_stat error '$err'";
995 return 1;
997 $self->register_rsync_error ($err);
999 $self->un_register_rsync_error ();
1000 } else {
1001 my $dst = $self->local_path($path);
1002 mkpath dirname $dst;
1003 while (!$self->rsync->exec
1005 src => join("/",
1006 $self->remoteroot,
1007 $path
1009 dst => $dst,
1010 )) {
1011 my($err) = $self->rsync->err;
1012 if ($self->ignore_link_stat_errors && $err =~ m{^ rsync: \s link_stat }x ) {
1013 if ($self->verbose) {
1014 warn "Info: ignoring link_stat error '$err'";
1016 return 1;
1018 $self->register_rsync_error ($err);
1020 $self->un_register_rsync_error ();
1022 return 1;
1025 sub _my_current_rfile {
1026 my($self) = @_;
1027 my $rfile;
1028 if ($self->_use_tempfile) {
1029 $rfile = $self->_current_tempfile;
1030 } else {
1031 $rfile = $self->rfile;
1033 return $rfile;
1036 =head2 $path = $obj->naive_path_normalize ($path)
1038 Takes an absolute unix style path as argument and canonicalizes it to
1039 a shorter path if possible, removing things like double slashes or
1040 C</./> and removes references to C<../> directories to get a shorter
1041 unambiguos path. This is used to make the code easier that determines
1042 if a file passed to C<upgrade()> is indeed below our C<localroot>.
1044 =cut
1046 sub naive_path_normalize {
1047 my($self,$path) = @_;
1048 $path =~ s|/+|/|g;
1049 1 while $path =~ s|/[^/]+/\.\./|/|;
1050 $path =~ s|/$||;
1051 $path;
1054 =head2 $ret = $obj->read_recent_1 ( $data )
1056 Delegate of C<recent_events()> on protocol 1
1058 =cut
1060 sub read_recent_1 {
1061 my($self, $data) = @_;
1062 return $data->{recent};
1065 =head2 $array_ref = $obj->recent_events ( %options )
1067 Note: the code relies on the resource being written atomically. We
1068 cannot lock because we may have no write access. If the caller has
1069 write access (eg. aggregate() or update()), it has to care for any
1070 necessary locking.
1072 If $options{after} is specified, only file events after this timestamp
1073 are returned.
1075 If $options{before} is specified, only file events before this
1076 timestamp are returned.
1078 IF $options{'skip-deletes'} is specified, no files-to-be-deleted will
1079 be returned.
1081 If $options{max} is specified only this many events are returned.
1083 If $options{info} is specified, it must be a hashref. This hashref
1084 will be filled with metadata about the unfiltered recent_events of
1085 this object, in key C<first> there is the first item, in key C<last>
1086 is the last.
1088 =cut
1090 sub recent_events {
1091 my ($self, %options) = @_;
1092 my $info = $options{info};
1093 if ($self->is_slave
1094 and (!$self->have_mirrored || Time::HiRes::time-$self->have_mirrored>420)) {
1095 $self->get_remote_recentfile_as_tempfile;
1097 my $rfile_or_tempfile = $self->_my_current_rfile or return [];
1098 -e $rfile_or_tempfile or return [];
1099 my $suffix = $self->serializer_suffix;
1100 my ($data) = eval {
1101 if ($suffix eq ".yaml") {
1102 require YAML::Syck;
1103 YAML::Syck::LoadFile($rfile_or_tempfile);
1104 } elsif ($HAVE->{"Data::Serializer"}) {
1105 my $serializer = Data::Serializer->new
1106 ( serializer => $serializers{$suffix} );
1107 my $serialized = do
1109 open my $fh, $rfile_or_tempfile or die "Could not open: $!";
1110 local $/;
1111 <$fh>;
1113 $serializer->raw_deserialize($serialized);
1114 } else {
1115 die "Data::Serializer not installed, cannot proceed with suffix '$suffix'";
1118 my $err = $@;
1119 if ($err or !$data) {
1120 return [];
1122 my $re;
1123 if (reftype $data eq 'ARRAY') { # protocol 0
1124 $re = $data;
1125 } else {
1126 my $meth = sprintf "read_recent_%d", $data->{meta}{protocol};
1127 # we may be reading meta for the first time
1128 while (my($k,$v) = each %{$data->{meta}}) {
1129 next if $k ne lc $k; # "Producers"
1130 next if defined $self->$k;
1131 $self->$k($v);
1133 $re = $self->$meth ($data);
1134 my @stat = stat $rfile_or_tempfile or die "Cannot stat '$rfile_or_tempfile': $!";
1135 my $minmax = { mtime => $stat[9] };
1136 if (@$re) {
1137 $minmax->{min} = $re->[-1]{epoch};
1138 $minmax->{max} = $re->[0]{epoch};
1140 $self->minmax ( $minmax );
1142 return $re unless defined $options{after}; # XXX same for before and max
1143 my $last_item = $#$re;
1144 if ($info) {
1145 $info->{first} = $re->[0];
1146 $info->{last} = $re->[-1];
1148 if (defined $options{after}) {
1149 if ($re->[0]{epoch} > $options{after}) {
1150 if (
1151 my $f = first
1152 {$re->[$_]{epoch} <= $options{after}}
1153 0..$#$re
1155 $last_item = $f-1;
1157 } else {
1158 $last_item = -1;
1161 my $first_item = 0;
1162 if (defined $options{before}) {
1163 if ($re->[0]{epoch} > $options{before}) {
1164 if (
1165 my $f = first
1166 {$re->[$_]{epoch} < $options{before}}
1167 0..$last_item
1169 $first_item = $f;
1171 } else {
1172 $first_item = 0;
1175 my @rre = splice @$re, $first_item, 1+$last_item-$first_item;
1176 if ($options{'skip-deletes'}) {
1177 @rre = grep { $_->{type} ne "delete" } @rre;
1179 if ($options{max} && @rre > $options{max}) {
1180 @rre = splice @rre, 0, $options{max};
1182 \@rre;
1185 =head2 $ret = $obj->rfilename
1187 Just the basename of our I<recentfile>, composed from C<filenameroot>,
1188 a dash, C<interval>, and C<serializer_suffix>. E.g. C<RECENT-6h.yaml>
1190 =cut
1192 sub rfilename {
1193 my($self) = @_;
1194 my $file = sprintf("%s-%s%s",
1195 $self->filenameroot,
1196 $self->interval,
1197 $self->serializer_suffix,
1199 return $file;
1202 =head2 $str = $self->remote_dir
1204 The directory we are mirroring from.
1206 =cut
1208 sub remote_dir {
1209 my($self, $set) = @_;
1210 if (defined $set) {
1211 $self->_remote_dir ($set);
1213 my $x = $self->_remote_dir;
1214 $self->is_slave (1);
1215 return $x;
1218 =head2 $str = $obj->remoteroot
1220 =head2 (void) $obj->remoteroot ( $set )
1222 Get/Set the composed prefix needed when rsyncing from a remote module.
1223 If remote_host, remote_module, and remote_dir are set, it is composed
1224 from these.
1226 =cut
1228 sub remoteroot {
1229 my($self, $set) = @_;
1230 if (defined $set) {
1231 $self->_remoteroot($set);
1233 my $remoteroot = $self->_remoteroot;
1234 unless (defined $remoteroot) {
1235 $remoteroot = sprintf
1237 "%s%s%s",
1238 defined $self->remote_host ? ($self->remote_host."::") : "",
1239 defined $self->remote_module ? ($self->remote_module."/") : "",
1240 defined $self->remote_dir ? $self->remote_dir : "",
1242 $self->_remoteroot($remoteroot);
1244 return $remoteroot;
1247 =head2 my $rfile = $obj->rfile
1249 Returns the full path of the I<recentfile>
1251 =cut
1253 sub rfile {
1254 my($self) = @_;
1255 my $rfile = $self->_rfile;
1256 return $rfile if defined $rfile;
1257 $rfile = File::Spec->catfile
1258 ($self->localroot,
1259 $self->rfilename,
1261 $self->_rfile ($rfile);
1262 return $rfile;
1265 =head2 $rsync_obj = $obj->rsync
1267 The File::Rsync object that this object uses for communicating with an
1268 upstream server.
1270 =cut
1272 sub rsync {
1273 my($self) = @_;
1274 my $rsync = $self->_rsync;
1275 unless (defined $rsync) {
1276 my $rsync_options = $self->rsync_options || {};
1277 if ($HAVE->{"File::Rsync"}) {
1278 $rsync = File::Rsync->new($rsync_options);
1279 $self->_rsync($rsync);
1280 } else {
1281 die "File::Rsync required for rsync operations. Cannot continue";
1284 return $rsync;
1287 =head2 (void) $obj->register_rsync_error($err)
1289 =head2 (void) $obj->un_register_rsync_error()
1291 Register_rsync_error is called whenever the File::Rsync object fails
1292 on an exec (say, connection doesn't succeed). It issues a warning and
1293 sleeps for an increasing amount of time. Un_register_rsync_error
1294 resets the error count. See also accessor C<max_rsync_errors>.
1296 =cut
1299 my $no_success_count = 0;
1300 my $no_success_time = 0;
1301 sub register_rsync_error {
1302 my($self, $err) = @_;
1303 chomp $err;
1304 $no_success_time = time;
1305 $no_success_count++;
1306 my $max_rsync_errors = $self->max_rsync_errors;
1307 $max_rsync_errors = -1 unless defined $max_rsync_errors;
1308 if ($max_rsync_errors>=0 && $no_success_count >= $max_rsync_errors) {
1309 die sprintf
1311 "Alert: Error while rsyncing: '%s', error count: %d, exiting now,",
1312 $err,
1313 $no_success_count,
1316 my $sleep = 12 * $no_success_count;
1317 $sleep = 120 if $sleep > 120;
1318 warn sprintf
1320 "Warning: %s, Error while rsyncing: '%s', sleeping %d",
1321 scalar(localtime($no_success_time)),
1322 $err,
1323 $sleep,
1325 sleep $sleep
1327 sub un_register_rsync_error {
1328 my($self) = @_;
1329 $no_success_time = 0;
1330 $no_success_count = 0;
1334 =head2 $clone = $obj->_sparse_clone
1336 Clones just as much from itself that it does not hurt. Experimental method.
1338 =cut
1340 sub _sparse_clone {
1341 my($self) = @_;
1342 my $new = bless {}, ref $self;
1343 for my $m (qw(
1344 _interval
1345 _localroot
1346 _remoteroot
1347 _rfile
1348 _use_tempfile
1349 aggregator
1350 filenameroot
1351 is_slave
1352 protocol
1353 rsync_options
1354 serializer_suffix
1355 verbose
1356 )) {
1357 my $o = $self->$m;
1358 $o = Storable::dclone $o if ref $o;
1359 $new->$m($o);
1361 $new;
1364 =head2 $boolean = OBJ->ttl_reached ()
1366 =cut
1368 sub ttl_reached {
1369 my($self) = @_;
1370 my $have_mirrored = $self->have_mirrored;
1371 my $now = Time::HiRes::time;
1372 my $ttl = $self->ttl;
1373 $ttl = 24.2 unless defined $ttl;
1374 if ($now > $have_mirrored + $ttl) {
1375 return 1;
1377 return 0;
1380 =head2 (void) $obj->unlock()
1382 Unlocking is implemented with an C<rmdir> on a locking directory
1383 (C<.lock> appended to $rfile).
1385 =cut
1387 sub unlock {
1388 my($self) = @_;
1389 return unless $self->_is_locked;
1390 my $rfile = $self->rfile;
1391 rmdir "$rfile.lock";
1392 $self->_is_locked (0);
1395 =head2 $ret = $obj->update ($path, $type)
1397 Enter one file into the local I<recentfile>. $path is the (usually
1398 absolute) path. If the path is outside the I<our> tree, then it is
1399 ignored.
1401 $type is one of C<new> or C<delete>.
1403 =cut
1405 sub update {
1406 my($self,$path,$type) = @_;
1407 die "update called without path argument" unless defined $path;
1408 die "update called without type argument" unless defined $type;
1409 die "update called with illegal type argument: $type" unless $type =~ /(new|delete)/;
1410 my $canonmeth = $self->canonize;
1411 unless ($canonmeth) {
1412 $canonmeth = "naive_path_normalize";
1414 $path = $self->$canonmeth($path);
1415 my $lrd = $self->localroot;
1416 if ($path =~ s|^\Q$lrd\E||) {
1417 $path =~ s|^/||;
1418 my $interval = $self->interval;
1419 my $secs = $self->interval_secs();
1420 my $epoch = Time::HiRes::time;
1421 $self->lock;
1422 my $recent = $self->recent_events;
1423 $recent ||= [];
1424 my $oldest_allowed = 0;
1425 if (my $merged = $self->merged) {
1426 my $secs = $self->interval_secs();
1427 $oldest_allowed = min($epoch - $secs, $merged->{epoch});
1429 TRUNCATE: while (@$recent) {
1430 if ($recent->[-1]{epoch} < $oldest_allowed) {
1431 pop @$recent;
1432 } else {
1433 last TRUNCATE;
1436 # remove older duplicates of this $path, irrespective of $type:
1437 $recent = [ grep { $_->{path} ne $path } @$recent ];
1439 unshift @$recent, { epoch => $epoch, path => $path, type => $type };
1440 $self->write_recent($recent);
1441 $self->_assert_symlink;
1442 $self->unlock;
1446 =head2 uptodate
1448 True if this object has mirrored the complete interval covered by the
1449 current recentfile.
1451 *** WIP ***
1453 =cut
1455 sub uptodate {
1456 my($self) = @_;
1457 return 0 if $self->ttl_reached;
1459 # look if recentfile has unchanged timestamp
1460 my $minmax = $self->minmax;
1461 if (exists $minmax->{mtime}) {
1462 my $rfile = $self->_my_current_rfile;
1463 my @stat = stat $rfile;
1464 my $mtime = $stat[9];
1465 if ($mtime > $minmax->{mtime}) {
1466 return 0;
1467 } else {
1468 return $self->done->covered(@$minmax{qw(min max)});
1471 return 0;
1474 =head2 $obj->write_recent ($recent_files_arrayref)
1476 Writes a I<recentfile> based on the current reflection of the current
1477 state of the tree limited by the current interval.
1479 =cut
1481 sub write_recent {
1482 my ($self,$recent) = @_;
1483 die "write_recent called without argument" unless defined $recent;
1484 my $meth = sprintf "write_%d", $self->protocol;
1485 $self->$meth($recent);
1488 =head2 $obj->write_0 ($recent_files_arrayref)
1490 Delegate of C<write_recent()> on protocol 0
1492 =cut
1494 sub write_0 {
1495 my ($self,$recent) = @_;
1496 my $rfile = $self->rfile;
1497 YAML::Syck::DumpFile("$rfile.new",$recent);
1498 rename "$rfile.new", $rfile or die "Could not rename to '$rfile': $!";
1501 =head2 $obj->write_1 ($recent_files_arrayref)
1503 Delegate of C<write_recent()> on protocol 1
1505 =cut
1507 sub write_1 {
1508 my ($self,$recent) = @_;
1509 my $rfile = $self->rfile;
1510 my $suffix = $self->serializer_suffix;
1511 my $data = {
1512 meta => $self->meta_data,
1513 recent => $recent,
1515 my $serialized;
1516 if ($suffix eq ".yaml") {
1517 $serialized = YAML::Syck::Dump($data);
1518 } elsif ($HAVE->{"Data::Serializer"}) {
1519 my $serializer = Data::Serializer->new
1520 ( serializer => $serializers{$suffix} );
1521 $serialized = $serializer->raw_serialize($data);
1522 } else {
1523 die "Data::Serializer not installed, cannot proceed with suffix '$suffix'";
1525 open my $fh, ">", "$rfile.new" or die "Could not open >'$rfile.new': $!";
1526 print $fh $serialized;
1527 close $fh or die "Could not close '$rfile.new': $!";
1528 rename "$rfile.new", $rfile or die "Could not rename to '$rfile': $!";
1531 BEGIN {
1532 my @pod_lines =
1533 split /\n/, <<'=cut'; %serializers = map { eval } grep {s/^=item\s+C<<(.+)>>$/$1/} @pod_lines; }
1535 =head1 THE ARCHITECTURE OF A COLLECTION OF RECENTFILES
1537 The idea is that we want to have a short file that records really
1538 recent changes. So that a fresh mirror can be kept fresh as long as
1539 the connectivity is given. Then we want longer files that record the
1540 history before. So when the mirror falls behind the update period
1541 reflected in the shortest file, it can switch to the next one. And if
1542 this is not long enough we want another one, again a bit longer. And
1543 we want one that completes the history back to the oldest file. For
1544 practical reasons the timespans of these files must overlap a bit and
1545 to keep the bandwidth necessities low they must not be
1546 updated too frequently. That's the basic idea. The following
1547 example represents a tree that has a few updates every day:
1549 RECENT.recent -> RECENT-1h.yaml
1550 RECENT-6h.yaml
1551 RECENT-1d.yaml
1552 RECENT-1M.yaml
1553 RECENT-1W.yaml
1554 RECENT-1Q.yaml
1555 RECENT-1Y.yaml
1556 RECENT-Z.yaml
1558 The first file is the principal file, in so far it is the one that is
1559 written first after a filesystem change. Usually a symlink links to it
1560 with a filename that has the same filenameroot and the suffix
1561 C<.recent>. On systems that do not support symlinks there is a plain
1562 copy maintained instead.
1564 The last file, the Z file, contains the complementary files that are
1565 in none of the other files. It does never contain C<deletes>. Besides
1566 this it serves the role of a recovery mechanism or spill over pond.
1567 When things go wrong, it's a valuable controlling instance to hold the
1568 differences between the collection of limited interval files and the
1569 actual filesystem.
1571 =head2 A SINGLE RECENTFILE
1573 A I<recentfile> consists of a hash that has two keys: C<meta> and
1574 C<recent>. The C<meta> part has metadata and the C<recent> part has a
1575 list of fileobjects.
1577 =head2 THE META PART
1579 Here we find things that are pretty much self explaining: all
1580 lowercase attributes are accessors and as such explained somewhere
1581 above in this manpage. The uppercase attribute C<Producers> contains
1582 version information about involved software components. Nothing to
1583 worry about as I believe.
1585 =head2 THE RECENT PART
1587 This is the interesting part. Every entry refers to some filesystem
1588 change (with path, epoch, type). The epoch value is the point in time
1589 when some change was I<registered>. Do not be tempted to believe that
1590 the entry has a direct relation to something like modification time or
1591 change time on the filesystem level. The timestamp (I<epoch> element)
1592 is a floating point number and does practically never correspond
1593 exactly to the data recorded in the filesystem but rather to the time
1594 when some process succeeded to report to the I<recentfile> mechanism
1595 that something has changed. This is why many parts of the code refer
1596 to I<events>, because we merely try to record the I<event> of the
1597 discovery of a change, not the time of the change itself.
1599 All these entries can be devided into two types (denoted by the
1600 C<type> attribute): C<new>s and C<delete>s. Changes and creations are
1601 C<new>s. Deletes are C<delete>s.
1603 Another distinction is for objects with an epoch timestamp and others
1604 without. All files that were already existing on the filesystem before
1605 the I<recentfile> mechanism was installed, get recorded with a
1606 timestamp of zero.
1608 Besides an C<epoch> and a C<type> attribute we find a third one:
1609 C<path>. This path is relative to the directory we find the
1610 I<recentfile> in.
1612 The order of the entries in the I<recentfile> is by decreasing epoch
1613 attribute. These are either 0 or a unique floating point number. They
1614 are zero for events that were happening either before the time that
1615 the I<recentfile> mechanism was set up or were left undiscovered for a
1616 while and never handed over to update(). They are floating point
1617 numbers for all events being regularly handed to update(). And when
1618 the server has ntp running correctly, then the timestamps are
1619 actually decreasing and unique.
1621 =head1 CORRUPTION AND RECOVERY
1623 If the origin host breaks the promise to deliver consistent and
1624 complete I<recentfiles> then the way back to sanity shall be achieved
1625 through either the C<zloop> (still TBD) or traditional rsyncing
1626 between the hosts. For example, if the origin server forgets to deploy
1627 ntp and the clock on it jumps backwards some day, then this would
1628 probably go unnoticed for a while and many software components that
1629 rely on the time never running backwards will make wrong decisions.
1630 After some time this accident would probably still be found in one of
1631 the I<recentfiles> but would become meaningless as soon as a mirror
1632 has run through the sanitizing procedures. Same goes for origin hosts
1633 that forget to include or deliberately omit some files.
1635 =head1 SERIALIZERS
1637 The following suffixes are supported and trigger the use of these
1638 serializers:
1640 =over 4
1642 =item C<< ".yaml" => "YAML::Syck" >>
1644 =item C<< ".json" => "JSON" >>
1646 =item C<< ".sto" => "Storable" >>
1648 =item C<< ".dd" => "Data::Dumper" >>
1650 =back
1652 =cut
1654 BEGIN {
1655 my @pod_lines =
1656 split /\n/, <<'=cut'; %seconds = map { eval } grep {s/^=item\s+C<<(.+)>>$/$1/} @pod_lines; }
1658 =head1 INTERVAL SPEC
1660 An interval spec is a primitive way to express time spans. Normally it
1661 is composed from an integer and a letter.
1663 As a special case, a string that consists only of the single letter
1664 C<Z>, stands for unlimited time.
1666 The following letters express the specified number of seconds:
1668 =over 4
1670 =item C<< s => 1 >>
1672 =item C<< m => 60 >>
1674 =item C<< h => 60*60 >>
1676 =item C<< d => 60*60*24 >>
1678 =item C<< W => 60*60*24*7 >>
1680 =item C<< M => 60*60*24*30 >>
1682 =item C<< Q => 60*60*24*90 >>
1684 =item C<< Y => 60*60*24*365.25 >>
1686 =back
1688 =cut
1690 =head1 BACKGROUND
1692 This is about speeding up rsync operation on large trees to many
1693 places. Uses a small metadata cocktail and pull technology.
1695 =head2 NON-COMPETITORS
1697 File::Mirror JWU/File-Mirror/File-Mirror-0.10.tar.gz only local trees
1698 Mirror::YAML ADAMK/Mirror-YAML-0.03.tar.gz some sort of inner circle
1699 Net::DownloadMirror KNORR/Net-DownloadMirror-0.04.tar.gz FTP sites and stuff
1700 Net::MirrorDir KNORR/Net-MirrorDir-0.05.tar.gz dito
1701 Net::UploadMirror KNORR/Net-UploadMirror-0.06.tar.gz dito
1702 Pushmi::Mirror CLKAO/Pushmi-v1.0.0.tar.gz something SVK
1704 rsnapshot www.rsnapshot.org focus on backup
1705 csync www.csync.org more like unison
1707 =head2 COMPETITORS
1709 The problem to solve which clusters and ftp mirrors and otherwise
1710 replicated datasets like CPAN share: how to transfer only a minimum
1711 amount of data to determine the diff between two hosts.
1713 Normally it takes a long time to determine the diff itself before it
1714 can be transferred. Known solutions at the time of this writing are
1715 csync2, and rsync 3 batch mode.
1717 For many years the best solution was csync2 which solves the
1718 problem by maintining a sqlite database on both ends and talking a
1719 highly sophisticated protocol to quickly determine which files to send
1720 and which to delete at any given point in time. Csync2 is often
1721 inconvenient because the act of syncing demands quite an intimate
1722 relationship between the sender and the receiver and suffers when the
1723 number of syncing sites is large or connections are unreliable.
1725 Rsync 3 batch mode works around these problems by providing rsync-able
1726 batch files which allow receiving nodes to replay the history of the
1727 other nodes. This reduces the need to have an incestuous relation but
1728 it has the disadvantage that these batch files replicate the contents
1729 of the involved files. This seems inappropriate when the nodes already
1730 have a means of communicating over rsync.
1732 rersyncrecent solves this problem with a couple of (usually 2-10)
1733 index files which cover different overlapping time intervals. The
1734 master writes these files and the clients can construct the full tree
1735 from the information contained in them. The most recent index file
1736 usually covers the last seconds or minutes or hours of the tree and
1737 depending on the needs, slaves can rsync every few seconds and then
1738 bring their trees in full sync.
1740 The rersyncrecent mode was developed for CPAN but I hope it is a
1741 convenient and economic general purpose solution. I'm looking forward
1742 to see a CPAN backbone that is only a few seconds behind PAUSE. And
1743 then ... the first FUSE based CPAN filesystem anyone?
1745 =head1 AUTHOR
1747 Andreas König
1749 =head1 BUGS
1751 Please report any bugs or feature requests through the web interface
1753 L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=File-Rsync-Mirror-Recentfile>.
1754 I will be notified, and then you'll automatically be notified of
1755 progress on your bug as I make changes.
1757 =head1 SUPPORT
1759 You can find documentation for this module with the perldoc command.
1761 perldoc File::Rsync::Mirror::Recentfile
1763 You can also look for information at:
1765 =over 4
1767 =item * RT: CPAN's request tracker
1769 L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=File-Rsync-Mirror-Recentfile>
1771 =item * AnnoCPAN: Annotated CPAN documentation
1773 L<http://annocpan.org/dist/File-Rsync-Mirror-Recentfile>
1775 =item * CPAN Ratings
1777 L<http://cpanratings.perl.org/d/File-Rsync-Mirror-Recentfile>
1779 =item * Search CPAN
1781 L<http://search.cpan.org/dist/File-Rsync-Mirror-Recentfile>
1783 =back
1786 =head1 ACKNOWLEDGEMENTS
1788 Thanks to RJBS for module-starter.
1790 =head1 COPYRIGHT & LICENSE
1792 Copyright 2008 Andreas König, all rights reserved.
1794 This program is free software; you can redistribute it and/or modify it
1795 under the same terms as Perl itself.
1798 =cut
1800 1; # End of File::Rsync::Mirror::Recentfile
1802 # Local Variables:
1803 # mode: cperl
1804 # cperl-indent-level: 4
1805 # End: