3 # jobd - perform Girocco maintenance jobs
5 # Run with --help for details
12 use POSIX
":sys_wait_h";
15 use lib
"__BASEDIR__";
20 BEGIN {noFatalsToBrowser
}
21 use Girocco
::ExecUtil
;
# Concurrency/scheduling defaults; most are overridable by the
# command-line options parsed later in this file.
26 my $cpus = online_cpus
;
# Default parallel job limit: cpus * 2, or 8 when the cpu count
# could not be determined (matches the POD defaults below).
28 my $max_par = $cpus ?
$cpus * 2 : 8;
# Resource-hungry (repack/gc) jobs get their own, stricter limit.
29 my $max_par_intensive = 1;
# Load-average thresholds "TRIG,UNTRIG": pause queueing above TRIG,
# resume below UNTRIG.  Defaults to cpus*1.5 and half that, or "6,3"
# when the cpu count is unknown (see POD).
30 my $load_triggers = $cpus ?
sprintf("%g,%g", $cpus * 1.5, $cpus * 0.75) : "6,3";
# Lock file for perpetual operation; $tmpsuffix keeps it unique per
# Girocco instance.
31 my $lockfile = "/tmp/jobd-$Girocco::Config::tmpsuffix.lock";
# Seconds to sleep between perpetual queue runs (--restart-delay).
32 my $restart_delay = 300;
# Mutually exclusive mode flags for --update-only / --gc-only /
# --needs-gc-only (enforced after option parsing).
36 my ($update_only, $gc_only, $needs_gc_only);
# Numeric thresholds split out of $load_triggers at startup.
38 my ($load_trig, $load_untrig);
# Body of the per-project mirror-update job.  NOTE(review): the
# enclosing sub header is not visible in this chunk (original lines
# are missing) — presumably "sub update_project"; confirm against the
# full file.  Skips the update when gc-only modes or bypass marker
# files apply, then runs jobd/update.sh for the project.
44 my $p = $job->{'project'};
45 check_project_exists
($job) || return;
# Fetching is skipped entirely in gc-only modes or when any of the
# .nofetch/.bypass/.bypass_fetch marker files exist; such projects
# go straight to gc scheduling.
46 if ($gc_only || $needs_gc_only ||
47 -e get_project_path
($p).".nofetch" ||
48 -e get_project_path
($p).".bypass" ||
49 -e get_project_path
($p).".bypass_fetch") {
51 return setup_gc
($job);
# A clone still in progress (and not failed) must not be updated yet.
53 if (-e get_project_path
($p).".clone_in_progress" && ! -e get_project_path
($p).".clone_failed") {
54 job_skip
($job, "initial mirroring not complete yet");
57 if (-e get_project_path
($p).".clone_failed") {
58 job_skip
($job, "initial mirroring failed");
59 # Still need to gc non top-level clones even if they've failed
60 # otherwise the objects copied into them from the parent will
61 # just accumulate without bound
# A "/" in the project name marks a non-top-level (forked) clone.
62 setup_gc
($job) if $p =~ m
,/,;
# Skip if the last refresh is recent enough (min_mirror_interval,
# randomly jittered by rand_adjust to spread the load).
65 if (my $ts = is_operation_uptodate
($p, 'lastrefresh', rand_adjust
($Girocco::Config
::min_mirror_interval
))) {
66 job_skip
($job, "not needed right now, last run at $ts");
70 if (is_svn_clone
($p)) {
71 # git svn can be very, very slow at times
72 $job->{'timeout_factor'} = 3;
74 exec_job_command
($job, ["$Girocco::Config::basedir/jobd/update.sh", $p], $quiet);
# Body of the per-project garbage-collection job.  NOTE(review): the
# enclosing sub header is not visible in this chunk — presumably
# "sub gc_project"; confirm against the full file.  Skips gc when
# marker files or freshness say so, then runs jobd/gc.sh.
79 my $p = $job->{'project'};
80 check_project_exists
($job) || return;
81 my $projpath = get_project_path
($p);
# gc is skipped in update-only mode, when .nogc/.bypass exist, or
# when gc is delayed (.delaygc) and neither explicitly allowed
# (.allowgc) nor urgently required (.needsgc).
82 if ($update_only || -e
"$projpath.nogc" || -e
"$projpath.bypass" ||
83 (-e
"$projpath.delaygc" && ! -e
"$projpath.allowgc" && ! -e
"$projpath.needsgc")) {
# Without a pending .needsgc marker, skip if in needs-gc-only mode
# or the last gc is recent enough (min_gc_interval with jitter).
88 if (! -e
"$projpath.needsgc" && ($needs_gc_only ||
89 ($ts = is_operation_uptodate
($p, 'lastgc', rand_adjust
($Girocco::Config
::min_gc_interval
))))) {
90 job_skip
($job, ($needs_gc_only ?
undef : "not needed right now, last run at $ts"));
93 # allow garbage collection to run for longer than an update
# Remember the pre-run gitweb.lastgc value so maybe_setup_gc_again
# can detect that gc.sh completed and cleared it.
94 $job->{'lastgc'} = get_git_config
($projpath, "gitweb.lastgc");
95 $job->{'timeout_factor'} = 2;
96 exec_job_command
($job, ["$Girocco::Config::basedir/jobd/gc.sh", $p], $quiet);
# Argument list for queueing a gc job.  NOTE(review): the enclosing
# call (presumably queue_job inside "sub setup_gc") is not fully
# visible in this chunk — the job runs gc_project and, on success,
# may schedule one follow-up gc via maybe_setup_gc_again.
102 project
=> $job->{'project'},
104 command
=> \
&gc_project
,
106 on_success
=> \
&maybe_setup_gc_again
,
# After a successful gc run, queue exactly one "bonus" gc run when
# the gc.sh invocation cleared gitweb.lastgc (i.e. a mini gc
# triggered a need for a full gc).  The follow-up job deliberately
# has no on_success handler, so loops are impossible.
110 sub maybe_setup_gc_again
{
112 # If lastgc was set then gc.sh ran successfully and now it's not set
113 # then queue up another run of gc.sh for the project.
114 # However, just in case, no matter what happens with the extra
115 # gc.sh run no more "bonus" runs are possible to avoid any loops.
116 # This allows a "mini" gc that triggers a full gc to have the
117 # full gc run as part of the same --all-once run through instead
118 # of waiting. A very good thing for users of the --all-once option.
119 if ($job->{'lastgc'}) {
120 my $projpath = get_project_path
($job->{'project'});
# Queue the extra run only if lastgc is now unset/empty.
121 get_git_config
($projpath, "gitweb.lastgc") or
123 project
=> $job->{'project'},
125 command
=> \
&gc_project
,
# Returns true when the job's project directory exists on disk;
# otherwise marks the job skipped (presumably returning false so
# callers can bail out with "... || return").
131 sub check_project_exists
{
133 my $p = $job->{'project'};
134 if (!-d get_project_path
($p)) {
135 job_skip
($job, "non-existent project");
# Maps a project name to its repository path:
# "$reporoot/<name>.git/" (note the trailing slash).
141 sub get_project_path
{
142 "$Girocco::Config::reporoot/".shift().".git/";
# One-entry cache for the most recently read git config file, keyed
# by path plus a dev:ino:size:mtime fingerprint so a changed file is
# re-read.  NOTE(review): the declarations of $_last_config_id and
# $_last_config, and the "sub get_git_config" header, are not
# visible in this chunk.
145 my $_last_config_path;
149 $_last_config_path = "";
150 $_last_config_id = "";
155 my ($projdir, $name) = @_;
# Bail out (undef) unless the project dir and its config file exist.
156 defined($projdir) && -d
$projdir && -f
"$projdir/config" or return undef;
157 my $cf = "$projdir/config";
158 my @stat = stat($cf);
# Require a non-empty file with a valid mtime.
159 @stat && $stat[7] && $stat[9] or return undef;
160 my $id = join(":", $stat[0], $stat[1], $stat[7], $stat[9]); # dev,ino,size,mtime
# Re-read and re-cache when the path or fingerprint changed (or the
# cache holds something other than a hash ref).
161 if ($_last_config_path ne $cf || $_last_config_id ne $id || ref($_last_config) ne 'HASH') {
162 my $data = read_config_file_hash
($cf);
163 defined($data) or $data = {};
# Invalidate first, then repopulate, so a partial update never
# looks like a valid cache entry.
164 $_last_config_path = $_last_config_id = "";
165 $_last_config = $data;
166 $_last_config_id = $id;
167 $_last_config_path = $cf;
169 return $_last_config->{$name};
# Returns the stored RFC 2822 timestamp of the named operation (e.g.
# 'lastrefresh', 'lastgc') when it ran within the last $threshold
# seconds, otherwise undef (i.e. the operation is due again).
172 sub is_operation_uptodate
{
173 my ($project, $which, $threshold) = @_;
174 my $path = get_project_path
($project);
175 my $timestamp = get_git_config
($path, "gitweb.$which");
176 defined($timestamp) or $timestamp = '';
# An unparsable/empty timestamp yields 0, i.e. "long ago".
177 my $unix_ts = parse_rfc2822_date
($timestamp) || 0;
178 (time - $unix_ts) <= $threshold ?
$timestamp : undef;
# Body detecting git-svn mirrors.  NOTE(review): the enclosing
# "sub is_svn_clone" header is not visible in this chunk.  True when
# the project's gitweb.baseurl is an svn:/svn+ URL and a
# svn-remote.svn.url is configured.
183 my $path = get_project_path
($project);
184 my $baseurl = get_git_config
($path, 'gitweb.baseurl');
185 defined($baseurl) or $baseurl = '';
186 my $svnurl = get_git_config
($path, 'svn-remote.svn.url');
187 defined($svnurl) or $svnurl = '';
188 return $baseurl =~ /^svn[:+]/i && $svnurl;
# Queueing helpers.  NOTE(review): the surrounding sub headers
# (presumably queue_one/queue_all) are not visible in this chunk.
# An update job always chains into gc scheduling, whether the
# update succeeded or failed.
196 command
=> \
&update_project
,
197 on_success
=> \
&setup_gc
,
198 on_error
=> \
&setup_gc
,
# queue_all: queue every known project.
203 queue_one
($_) for (Girocco
::Project
->get_full_list());
206 ######### Daemon operation {{{1
216 # Kills and reaps the specified pid. Returns exit status ($?) on success
217 # otherwise undef if process could not be killed or reaped
218 # First sends SIGINT and if process does not exit within 15 seconds then SIGKILL
219 # We used to send SIGTERM instead of SIGINT, but by using SIGINT we can take
220 # advantage of "tee -i" in our update scripts and really anything we're killing
221 # should respond the same to either SIGINT or SIGTERM and exit gracefully.
222 # Usage: my $exitcode = kill_gently($pid, $kill_process_group = 0);
# NOTE(review): the "sub kill_gently" header and the "my $targ"
# unpacking line are not visible in this chunk.
225 my $use_pg = shift || 0;
226 # Note that the docs for Perl's kill state that a negative signal
227 # number should be used to kill process groups and that while a
228 # a negative process id (and positive signal number) may also do that
229 # on some platforms, that's not portable.
# Multiplying the signal number by -1 targets the process group.
230 my $pg = $use_pg ?
-1 : 1;
231 my $harsh = time() + 15; # SIGKILL after this delay
232 my $count = kill(2*$pg, $targ); # SIGINT is 2
# Non-blocking reap attempts: <0 means nothing to reap (failure),
# == $targ means reaped, so return the exit status in $?.
233 my $reaped = waitpid($targ, WNOHANG
);
234 return undef if $reaped < 0;
235 return $?
if $reaped == $targ;
# Poll every 0.2s for up to 15s waiting for a graceful SIGINT exit.
236 while ($count && time() < $harsh) {
237 select(undef, undef, undef, 0.2);
238 $reaped = waitpid($targ, WNOHANG
);
239 return undef if $reaped < 0;
240 return $?
if $reaped == $targ;
# Graceful shutdown failed; escalate to SIGKILL.
243 $count = kill(9*$pg, $targ); # SIGKILL is 9
244 $reaped = waitpid($targ, WNOHANG
);
245 return undef if $reaped < 0;
246 return $?
if $reaped == $targ;
247 # We should not need to wait to reap a SIGKILL, however, just in case
248 # the system doesn't make a SIGKILL'd process immediately reapable
249 # (perhaps under extremely heavy load) we accommodate a brief delay
250 while ($count && time() < $harsh) {
251 select(undef, undef, undef, 0.2);
252 $reaped = waitpid($targ, WNOHANG
);
253 return undef if $reaped < 0;
254 return $?
if $reaped == $targ;
# First ^C: stop queueing, let running jobs finish; a second ^C
# (rebound to handle_exit below) kills everything immediately.
259 sub handle_softexit
{
260 error
("Waiting for outstanding jobs to finish... ".
261 "^C again to exit immediately");
264 $SIG{'INT'} = \
&handle_exit
;
# Hard exit path.  NOTE(review): the "sub handle_exit" header is not
# visible in this chunk.  Kills each running job's process group,
# then removes the perpetual-mode lock file if we created it.
268 error
("Killing outstanding jobs, please be patient...");
269 $SIG{'TERM'} = 'IGNORE';
271 kill_gently
($_->{'pid'}, 1);
273 unlink $lockfile if ($locked);
# Job bookkeeping fragments.  NOTE(review): the enclosing sub
# headers (presumably queue_job / run_job / _job_name) are not
# visible in this chunk.
# queue_job: stamp the entry and default its flags.
279 $opts{'queued_at'} = time;
280 $opts{'dont_run'} = 0;
281 $opts{'intensive'} = 0 unless exists $opts{'intensive'};
# run_job: invoke the job's command; a command may set 'dont_run'
# (via job_skip) to mark itself skipped instead of executed.
289 $job->{'command'}->($job);
290 if ($job->{'dont_run'}) {
# _job_name: human-readable label used in all log messages.
299 "[".$job->{'type'}."::".$job->{'project'}."]";
302 # Only one of those per job!
# Forks and execs the job's external command in its own process
# group, recording the child's pid and start time on the job.
# $err_only suppresses stdout (redirected to /dev/null) when set.
303 sub exec_job_command
{
304 my ($job, $command, $err_only) = @_;
307 $job->{'finished'} = 0;
308 delete $job->{'pid'};
309 if (!defined($pid = fork)) {
310 error
(_job_name
($job) ." Can't fork job: $!");
311 $job->{'finished'} = 1;
# Child process from here down.
316 select(undef, undef, undef, 0.1);
# NOTE(review): '||' binds tighter than the comma, so it attaches to
# '/dev/null' (always true) and this error handler can never run —
# should be low-precedence 'or' instead of '||'.
318 open STDIN
, '<', '/dev/null' || do {
319 error
(_job_name
($job) ."Can't read from /dev/null: $!");
# NOTE(review): same '||' vs 'or' precedence problem as above.
323 open STDOUT
, '>', '/dev/null' || do {
324 error
(_job_name
($job) ." Can't write to /dev/null: $!");
328 # New process group so we can keep track of all of its children
329 if (!defined(POSIX
::setpgid
(0, 0))) {
330 error
(_job_name
($job) ." Can't create process group: $!");
335 # Stop perl from complaining
# Parent: remember the child for reaping/killing later.
338 $job->{'pid'} = $pid;
339 $job->{'started_at'} = time;
# job_skip body: mark the job as not-to-run and log why (silently
# when quiet or when no message is given).  NOTE(review): the
# "sub job_skip" header is not visible in this chunk.
343 my ($job, $msg) = @_;
344 $job->{'dont_run'} = 1;
345 error
(_job_name
($job) ." Skipping job: $msg") unless $quiet || !$msg;
# Kill any running job that exceeded its (factor-scaled) timeout.
348 sub reap_hanging_jobs
{
350 my $factor = $_->{'timeout_factor'} || 1;
351 if (defined($_->{'started_at'}) && (time - $_->{'started_at'}) > ($kill_after * $factor)) {
352 $_->{'finished'} = 1;
# Kill the whole process group (second arg 1), not just the pid.
353 my $exitcode = kill_gently
($_->{'pid'}, 1);
# Low 7 bits of the exit status hold the terminating signal;
# 9 means SIGINT wasn't enough and SIGKILL was used.
356 error
(_job_name
($_) ." KILLED due to timeout" .
357 (($exitcode & 0x7f) == 9 ?
" with SIGKILL": ""));
358 push @jobs_killed, _job_name
($_);
# reap_one_job body: fire the job's success/error callback exactly
# once.  NOTE(review): the "sub reap_one_job" header is not visible
# in this chunk.
365 if (!$job->{'finished'}) {
366 $job->{'on_success'}->($job) if defined($job->{'on_success'});
367 $job->{'finished'} = 1;
370 $job->{'on_error'}->($job) if defined($job->{'on_error'});
# Collect exited children: first handle jobs already killed by
# reap_hanging_jobs, then waitpid-poll for naturally finished ones.
374 sub reap_finished_jobs
{
376 my $finished_any = 0;
377 foreach my $child (grep { !$_->{'pid'} && $_->{'killed'} } @running) {
378 delete $child->{'killed'};
379 reap_one_job
($child);
# Non-blocking reap of any child process.
383 $pid = waitpid(-1, WNOHANG
);
387 my @child = grep { $_->{'pid'} && $_->{'pid'} == $pid } @running;
389 # any non-zero exit status should trigger on_error
390 $child[0]->{'finished'} = 1 if @child;
393 delete $child[0]->{'pid'};
394 reap_one_job
($child[0]);
# Drop finished jobs from the running list.
397 @running = grep { $_->{'finished'} == 0 } @running;
# Count of currently running resource-hungry jobs (list in scalar
# context at the call site).
401 sub have_intensive_jobs
{
402 grep { $_->{'intensive'} == 1 } @running;
# ts body: timestamp prefix for log lines.  NOTE(review): the
# "sub ts" header is not visible in this chunk.
406 "[". scalar(localtime) ."] ";
# Returns the 1/5/15-minute load averages, or undef when they cannot
# be determined.  Reads /proc/loadavg on Linux, otherwise parses the
# output of uptime(1).  NOTE(review): bareword filehandle LOADAV and
# the missing close() lines are artifacts of this chunk's gaps; a
# lexical filehandle would be preferred in a rewrite.
410 if ($^O
eq "linux") {
411 # Read /proc/loadavg on Linux
412 open(LOADAV
, '<', '/proc/loadavg') or return undef;
413 my $loadinfo = <LOADAV
>;
# First three whitespace-separated fields are the load averages.
415 return (split(/\s/, $loadinfo, 4))[0..2];
417 # Read the output of uptime everywhere else (works on Linux too)
418 open(LOADAV
, '-|', 'uptime') or return undef;
419 my $loadinfo = <LOADAV
>;
421 $loadinfo =~ /load average[^0-9.]*([0-9.]+)[^0-9.]+([0-9.]+)[^0-9.]+([0-9.]+)/iso or return undef;
# Main scheduling loop body.  NOTE(review): the "sub run_queue"
# header is not visible in this chunk.  Drains @queue into @running
# while respecting $max_par, $max_par_intensive and the system-load
# pause/resume thresholds, printing periodic status when requested.
427 my $last_progress = time;
# Start 5s in the past so the first load check happens immediately.
428 my $last_checkload = time - 5;
# NOTE(review): $current_load is redeclared with 'my' further down
# (line 446), shadowing this one — confirm the intended scoping
# against the full file.
429 my $current_load = $load_trig;
436 my $s = @queue == 1 ?
'' : 's';
437 ferror
("--- Processing %d queued job$s", scalar(@queue));
439 $SIG{'INT'} = \
&handle_softexit
;
440 $SIG{'TERM'} = \
&handle_exit
;
441 while (@queue || @running) {
443 my $proceed_immediately = reap_finished_jobs
();
444 # Check current system load
445 if ($load_trig && (time - $last_checkload) >= 5 && defined((my @loadinfo = get_load_info
())[0])) {
446 my $current_load = $loadinfo[0];
# Hysteresis: pause above $load_trig, resume only below
# $load_untrig.
447 if ($current_load > $load_trig && !$overloaded) {
449 error
("PAUSE: system load is at $current_load > $load_trig") if $progress;
450 } elsif ($current_load < $load_untrig && $overloaded) {
452 error
("RESUME: system load is at $current_load < $load_untrig") if $progress;
455 $load_info = ', paused (load '. $current_load .')';
457 $load_info = ', load '. $current_load;
459 $last_checkload = time;
# At most one status report per minute.
462 if ($progress && (time - $last_progress) >= 60) {
463 ferror
("STATUS: %d queued, %d running, %d finished, %d skipped, %d killed$load_info", scalar(@queue), scalar(@running), $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
467 push @run_status, _job_name
($_)." ". (time - $_->{'started_at'}) ."s";
469 error
("STATUS: currently running: ". join(', ', @run_status));
471 $last_progress = time;
473 # Back off if we're too busy
474 if (@running >= $max_par || have_intensive_jobs
() >= $max_par_intensive || !@queue || $overloaded) {
475 sleep 1 unless $proceed_immediately;
479 run_job
(shift(@queue)) if @queue;
482 my $s = $jobs_executed == 1 ?
'' : 's';
483 ferror
("--- Queue processed. %d job$s executed, %d skipped, %d killed.", $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
# Perpetual mode: hold a lock file, process the full queue, honor an
# external "restart" request written into the lock file, and sleep
# $restart_delay between rounds.
487 sub run_perpetually
{
489 die "Lockfile '$lockfile' exists. Please make sure no other instance of jobd is running.\n";
# NOTE(review): '||' binds to $lockfile (always true here), so this
# die can never fire — should be low-precedence 'or'.  Bareword
# filehandle LOCK would also be a lexical handle in a rewrite.
491 open LOCK
, '>', $lockfile || die "Cannot create lockfile '$lockfile': $!\n";
498 # touch ctime of lockfile to prevent it from being removed by /tmp cleaning
499 chmod 0640, $lockfile;
500 chmod 0644, $lockfile;
501 # check for restart request
# NOTE(review): same '||' vs 'or' precedence problem as above.
502 open LOCK
, '<', $lockfile || die "Lock file '$lockfile' has disappeared!\n";
503 my $request = <LOCK
>;
505 chomp $request if defined($request);
# Writing "restart" into the lock file asks jobd to re-exec itself.
506 if (defined($request) && $request eq "restart") {
512 sleep($restart_delay) if $perpetual; # Let the system breathe for a moment
519 ######### Helpers {{{1
# error: timestamped line to STDERR; ferror: sprintf wrapper around
# error.  NOTE(review): the sub headers for both are not visible in
# this chunk.
522 print STDERR ts
().shift()."\n";
525 error
(sprintf($_[0], @_[1..$#_]));
# Main program: parse options, validate mutually exclusive modes,
# then run once (--all-once/--one) or perpetually with graceful
# restart support via Girocco::ExecUtil.
534 my $reexec = Girocco
::ExecUtil
->new;
535 my $realpath0 = realpath
($0);
# Close the DATA handle (POD trailer) so re-exec doesn't leak it.
537 close(DATA
) if fileno(DATA
);
539 Getopt
::Long
::Configure
('bundling');
541 my $parse_res = GetOptions
(
# --help: feed our own resolved path to pod2usage so the POD below
# is displayed even when invoked via a symlink.
543 pod2usage
(-verbose
=> 2, -exitval
=> 0, -input
=> $realpath0)},
544 'quiet|q' => \
$quiet,
545 'progress|P' => \
$progress,
546 'kill-after|k=i' => \
$kill_after,
547 'max-parallel|p=i' => \
$max_par,
548 'max-intensive-parallel|i=i' => \
$max_par_intensive,
549 'load-triggers=s' => \
$load_triggers,
550 'restart-delay|d=i' => \
$restart_delay,
551 'lockfile|l=s' => \
$lockfile,
552 'same-pid' => \
$same_pid,
553 'all-once|a' => \
$all_once,
# --one may be repeated; %one_once de-duplicates while @one keeps
# the original order.
554 'one|o=s' => sub {$one_once{$_[1]} = 1, push(@one, $_[1])
555 unless exists $one_once{$_[1]}},
556 'update-only' => \
$update_only,
557 'gc-only' => \
$gc_only,
558 'needs-gc-only' => \
$needs_gc_only,
559 ) || pod2usage
(-exitval
=> 2, -input
=> $realpath0);
560 fatal
("Error: can only use one out of --all-once and --one")
561 if $all_once && @one;
# The three *-only flags are mutually exclusive and require a
# one-shot mode (--all-once or --one).
562 my $onlycnt = ($update_only?
1:0) + ($gc_only?
1:0) + ($needs_gc_only?
1:0);
563 fatal
("Error: can only use one out of --update-only, --gc-only and --needs-gc-only")
565 fatal
("Error: --update-only, --gc-only or --needs-gc-only requires --all-once or --one")
566 if $onlycnt && !($all_once || @one);
# Presumably propagated to the job shell scripts — TODO confirm.
569 $ENV{'show_progress'} = '1';
# Disable load checks entirely when load info is unavailable.
573 $load_triggers = '0,0' unless defined((get_load_info
())[0]);
574 ($load_trig, $load_untrig) = split(/,/, $load_triggers);
577 queue_one
($_) foreach @one;
# Perpetual loop: re-exec in place on a "restart" request.
589 if (run_perpetually
() eq "restart") {
590 error
("Restarting in response to restart request... ");
591 $reexec->reexec($same_pid);
# reexec only returns on failure; log and fall through to retry.
592 error
("Continuing after failed restart: $!");
598 ########## Documentation {{{1
604 jobd.pl - Perform Girocco maintenance jobs
611 -h | --help detailed instructions
612 -q | --quiet run quietly
613 -P | --progress show occasional status updates
614 -k SECONDS | --kill-after SECONDS how long to wait before killing jobs
615 -p NUM | --max-parallel NUM how many jobs to run at the same time
616 -i NUM | --max-intensive-parallel NUM how many resource-hungry jobs to run
618 --load-triggers TRIG,UNTRIG stop queueing jobs at load above
619 TRIG and resume at load below UNTRIG
620 -d NUM | --restart-delay SECONDS wait for this many seconds between
622 -l FILE | --lockfile FILE create a lockfile in the given
624 --same-pid keep same pid during graceful restart
625 -a | --all-once process the list only once
626 -o PRJNAME | --one PRJNAME process only one project
627 --update-only process mirror updates only
628 --gc-only perform needed garbage collection only
629 --needs-gc-only perform needed mini gc only
637 Print the full description of jobd.pl's options.
641 Suppress non-error messages, e.g. for use when running this task as a cronjob.
645 Show information about the current status of the job queue occasionally. This
646 is automatically enabled if --quiet is not given.
648 =item B<--kill-after SECONDS>
650 Kill supervised jobs after a certain time to avoid hanging the daemon.
652 =item B<--max-parallel NUM>
654 Run no more than that many jobs at the same time. The default is the number
655 of cpus * 2. If the number of cpus cannot be determined, the default is 8.
657 =item B<--max-intensive-parallel NUM>
659 Run no more than that many resource-hungry jobs at the same time. Right now,
660 this refers to repacking jobs. The default is 1.
662 =item B<--load-triggers TRIG,UNTRIG>
664 If the first system load average (1 minute average) exceeds TRIG, don't queue
665 any more jobs until it goes below UNTRIG. This is currently only supported on
666 Linux and any other platforms that provide an uptime command with load average
669 If both values are zero, load checks are disabled. The default is the number
670 of cpus * 1.5 for TRIG and half that for UNTRIG. If the number of cpus cannot
671 be determined, the default is 6,3.
673 =item B<--restart-delay NUM>
675 After processing the queue, wait this many seconds until the queue is
676 restarted. The default is 300 seconds.
678 =item B<--lockfile FILE>
680 For perpetual operation, specify the full path to a lock file to create and
681 then remove after finishing/aborting. The default is /tmp/jobd-$suffix.lock
682 where $suffix is a 6-character string uniquely determined by the name and
683 nickname of this Girocco instance. The pid of the running jobd instance will
684 be written to the lock file.
688 When performing a graceful restart, keep the same pid rather than switching to
693 Instead of perpetually processing all projects over and over again, process
694 them just once and then exit.
695 Conflicts with B<--one PRJNAME> option.
697 =item B<--one PRJNAME>
699 Process only the given project (given as just the project name without C<.git>
700 suffix) and then exit. May be repeated to process more than one project.
701 Conflicts with B<--all-once> option.
703 =item B<--update-only>
705 Limit processing to only those projects that need a mirror update.
706 Behaves as though every project has a C<.nogc> file present in it.
707 Requires use of B<--all-once> or B<--one PRJNAME> option.
708 Conflicts with B<--gc-only> and B<--needs-gc-only> options.
712 Limit processing to only those projects that need to have garbage collection
713 run on them. Behaves as though every project has a C<.bypass_fetch> file
714 present in it. Requires use of B<--all-once> or B<--one PRJNAME> option.
715 Conflicts with B<--update-only> and B<--needs-gc-only> options.
717 =item B<--needs-gc-only>
719 Limit processing to only those projects that need to have mini garbage
720 collection run on them. Behaves as though every project with a C<.needsgc>
721 file present in it also has a C<.bypass_fetch> file present in it and as though
722 every project without a C<.needsgc> file present in it has a C<.bypass> file
723 present in it. Requires use of B<--all-once> or B<--one PRJNAME> option.
724 Conflicts with B<--update-only> and B<--gc-only> options.
730 jobd.pl is Girocco's repositories maintenance servant; it periodically checks
731 all the repositories and updates mirrored repositories and repacks push-mode
732 repositories when needed.