3 # jobd - perform Girocco maintenance jobs
5 # Run with --help for details
12 use POSIX
":sys_wait_h";
15 use lib
"__BASEDIR__";
20 BEGIN {noFatalsToBrowser
}
21 use Girocco
::ExecUtil
;
# ---- Tunable defaults (all overridable via the command-line options
# ---- documented in the POD below).
# NOTE(review): the integers fused onto the start of these lines are
# original-source line numbers left behind by a broken extraction, not
# Perl code; they are preserved untouched here.
26 my $cpus = online_cpus
;
# Run up to 2 jobs per CPU; fall back to 8 when the CPU count is unknown.
28 my $max_par = $cpus ?
$cpus * 2 : 8;
29 my $max_par_intensive = 1;
# "TRIG,UNTRIG" load averages: pause queueing above TRIG, resume below UNTRIG.
30 my $load_triggers = $cpus ?
sprintf("%g,%g", $cpus * 1.5, $cpus * 0.75) : "6,3";
31 my $lockfile = "/tmp/jobd-$Girocco::Config::tmpsuffix.lock";
# Seconds to pause between perpetual runs of the whole queue.
32 my $restart_delay = 300;
36 my ($update_only, $gc_only, $needs_gc_only);
38 my ($load_trig, $load_untrig);
# NOTE(review): interior of the mirror-update job handler (the enclosing
# "sub update_project {" line and several original lines are missing from
# this extraction; the code below is preserved byte-for-byte).
44 my $p = $job->{'project'};
45 check_project_exists
($job) || return;
46 my $projpath = get_project_path
($p);
# Projects that must not fetch (gc-only modes, .nofetch/.bypass markers,
# or a disabled mirror URL) go straight to gc scheduling instead.
47 if ($gc_only || $needs_gc_only ||
48 -e
"$projpath/.nofetch" ||
49 -e
"$projpath/.bypass" ||
50 -e
"$projpath/.bypass_fetch" ||
51 is_mirror_disabled
($p)) {
53 return setup_gc
($job);
# Skip while the initial clone is still in progress.
55 if (-e
"$projpath/.clone_in_progress" && ! -e
"$projpath/.clone_failed") {
56 job_skip
($job, "initial mirroring not complete yet");
59 if (-e
"$projpath/.clone_failed") {
60 job_skip
($job, "initial mirroring failed");
61 # Still need to gc non top-level clones even if they've failed
62 # otherwise the objects copied into them from the parent will
63 # just accumulate without bound
64 setup_gc
($job) if $p =~ m
,/,;
# Nothing to do when the last refresh is recent enough (interval is
# jittered via rand_adjust to spread refreshes out).
67 if (my $ts = is_operation_uptodate
($p, 'lastrefresh', rand_adjust
($Girocco::Config
::min_mirror_interval
))) {
68 job_skip
($job, "not needed right now, last run at $ts");
72 if (is_svn_clone
($p)) {
73 # git svn can be very, very slow at times
74 $job->{'timeout_factor'} = 3;
# Hand the actual fetch off to the external update.sh script.
76 exec_job_command
($job, ["$Girocco::Config::basedir/jobd/update.sh", $p], $quiet);
# NOTE(review): interior of the garbage-collection job handler (the
# enclosing "sub gc_project {" line and several original lines are missing
# from this extraction; the code below is preserved byte-for-byte).
81 my $p = $job->{'project'};
82 check_project_exists
($job) || return;
83 my $projpath = get_project_path
($p);
# Respect the various opt-out marker files and --update-only mode.
84 if ($update_only || -e
"$projpath/.nogc" || -e
"$projpath/.bypass" ||
85 (-e
"$projpath/.delaygc" && ! -e
"$projpath/.allowgc" && ! -e
"$projpath/.needsgc")) {
# Without an explicit .needsgc marker, skip when the last gc is recent
# enough (interval jittered via rand_adjust).
90 if (! -e
"$projpath/.needsgc" && ($needs_gc_only ||
91 ($ts = is_operation_uptodate
($p, 'lastgc', rand_adjust
($Girocco::Config
::min_gc_interval
))))) {
92 job_skip
($job, ($needs_gc_only ?
undef : "not needed right now, last run at $ts"));
95 # allow garbage collection to run for longer than an update
96 $job->{'lastgc'} = get_git_config
($projpath, "gitweb.lastgc");
97 $job->{'timeout_factor'} = 2;
98 exec_job_command
($job, ["$Girocco::Config::basedir/jobd/gc.sh", $p], $quiet);
# NOTE(review): argument-list fragment of the queue_job() call made by
# setup_gc (the surrounding call syntax was lost in extraction): queue a
# gc job whose success handler may schedule one follow-up gc run.
104 project
=> $job->{'project'},
106 command
=> \
&gc_project
,
108 on_success
=> \
&maybe_setup_gc_again
,
112 sub maybe_setup_gc_again
{
# NOTE(review): the original "my $job = shift;" line and the closing
# queue_job(...) wrapper/brace lines are missing from this extraction;
# the visible code is preserved byte-for-byte.
114 # If lastgc was set then gc.sh ran successfully and now it's not set
115 # then queue up another run of gc.sh for the project.
116 # However, just in case, no matter what happens with the extra
117 # gc.sh run no more "bonus" runs are possible to avoid any loops.
118 # This allows a "mini" gc that triggers a full gc to have the
119 # full gc run as part of the same --all-once run through instead
120 # of waiting. A very good thing for users of the --all-once option.
121 if ($job->{'lastgc'}) {
122 my $projpath = get_project_path
($job->{'project'});
123 get_git_config
($projpath, "gitweb.lastgc") or
125 project
=> $job->{'project'},
127 command
=> \
&gc_project
,
# Validate that the job's project still has a repository directory on
# disk.  On a missing directory the job is marked skipped and a false
# value is returned; otherwise returns true so callers can write
# check_project_exists($job) || return;
# NOTE(review): the extraction dropped the original argument unpack and
# closing lines; they are reconstructed conservatively here.
sub check_project_exists {
	my $job = shift;
	my $p = $job->{'project'};
	if (! -d get_project_path($p)) {
		job_skip($job, "non-existent project");
		return undef;
	}
	return 1;
}
# Map a project name (without the ".git" suffix) to its repository path
# under $Girocco::Config::reporoot.
# NOTE(review): only the closing brace was missing from the extraction.
sub get_project_path {
	return "$Girocco::Config::reporoot/".shift().".git";
}
# Single-entry cache of the most recently read project config file,
# keyed by path plus a "dev:ino:size:mtime" identity string.
# NOTE(review): the extraction dropped the declarations of
# $_last_config_id/$_last_config and the sub opener; reconstructed here.
my $_last_config_path;
my $_last_config_id;
my $_last_config;
$_last_config_path = "";
$_last_config_id = "";

# Fetch one key from a project's "config" file, re-reading the file only
# when its identity (dev, inode, size, mtime) changes.
# Returns undef when the directory or config file is missing/unreadable.
sub get_git_config {
	my ($projdir, $name) = @_;
	defined($projdir) && -d $projdir && -f "$projdir/config" or return undef;
	my $cf = "$projdir/config";
	my @stat = stat($cf);
	@stat && $stat[7] && $stat[9] or return undef;
	my $id = join(":", $stat[0], $stat[1], $stat[7], $stat[9]); # dev,ino,size,mtime
	if ($_last_config_path ne $cf || $_last_config_id ne $id || ref($_last_config) ne 'HASH') {
		my $data = read_config_file_hash($cf);
		defined($data) or $data = {};
		# Reset the identity before swapping in new data so a partially
		# updated cache can never validate as current.
		$_last_config_path = $_last_config_id = "";
		$_last_config = $data;
		$_last_config_id = $id;
		$_last_config_path = $cf;
	}
	return $_last_config->{$name};
}
# Return the stored "gitweb.$which" timestamp string if the operation ran
# within the last $threshold seconds, otherwise undef (i.e. the operation
# is due to run again).
# NOTE(review): only the closing brace was missing from the extraction.
sub is_operation_uptodate {
	my ($project, $which, $threshold) = @_;
	my $path = get_project_path($project);
	my $timestamp = get_git_config($path, "gitweb.$which");
	defined($timestamp) or $timestamp = '';
	my $unix_ts = parse_rfc2822_date($timestamp) || 0;
	return (time - $unix_ts) <= $threshold ? $timestamp : undef;
}
# A mirror is considered disabled when its gitweb.baseurl config value is
# empty, contains embedded whitespace, or begins with "disabled".
# NOTE(review): the extraction dropped the "my ($project) = @_;" line and
# the closing brace; reconstructed here.
sub is_mirror_disabled {
	my ($project) = @_;
	my $path = get_project_path($project);
	my $baseurl = get_git_config($path, 'gitweb.baseurl');
	defined($baseurl) or $baseurl = '';
	$baseurl =~ s/^\s+//;
	$baseurl =~ s/\s+$//;
	# After trimming, any remaining \s means interior whitespace.
	return $baseurl eq "" || $baseurl =~ /\s/ || $baseurl =~ /^disabled(?:\s|$)/i;
}
# True when the project is mirrored from Subversion: its gitweb.baseurl
# uses an svn: or svn+... scheme and svn-remote.svn.url is set.
# NOTE(review): the extraction dropped the sub opener and argument
# unpack; reconstructed here (the sub is called as is_svn_clone($p)).
sub is_svn_clone {
	my ($project) = @_;
	my $path = get_project_path($project);
	my $baseurl = get_git_config($path, 'gitweb.baseurl');
	defined($baseurl) or $baseurl = '';
	my $svnurl = get_git_config($path, 'svn-remote.svn.url');
	defined($svnurl) or $svnurl = '';
	return $baseurl =~ /^svn[:+]/i && $svnurl;
}
# NOTE(review): queue_job() argument fragment for setup_update (the call
# wrapper was lost in extraction): run update_project and schedule gc
# both on success and on error.
208 command
=> \
&update_project
,
209 on_success
=> \
&setup_gc
,
210 on_error
=> \
&setup_gc
,
# Body of queue_all: enqueue every known project.
215 queue_one
($_) for (Girocco
::Project
->get_full_list());
218 ######### Daemon operation {{{1
228 # Kills and reaps the specified pid. Returns exit status ($?) on success
229 # otherwise undef if process could not be killed or reaped
230 # First sends SIGINT and if process does not exit within 15 seconds then SIGKILL
231 # We used to send SIGTERM instead of SIGINT, but by using SIGINT we can take
232 # advantage of "tee -i" in our update scripts and really anything we're killing
233 # should respond the same to either SIGINT or SIGTERM and exit gracefully.
234 # Usage: my $exitcode = kill_gently($pid, $kill_process_group = 0);
# NOTE(review): the "sub kill_gently {" opener and the "my $targ = shift;"
# line are missing from this extraction; $targ below is the target pid.
237 my $use_pg = shift || 0;
238 # Note that the docs for Perl's kill state that a negative signal
239 # number should be used to kill process groups and that while
240 # a negative process id (and positive signal number) may also do that
241 # on some platforms, that's not portable.
242 my $pg = $use_pg ?
-1 : 1;
243 my $harsh = time() + 15; # SIGKILL after this delay
244 my $count = kill(2*$pg, $targ); # SIGINT is 2
245 my $reaped = waitpid($targ, WNOHANG
);
246 return undef if $reaped < 0;
247 return $?
if $reaped == $targ;
# Poll every 0.2s for up to 15s for the process to exit after SIGINT.
248 while ($count && time() < $harsh) {
249 select(undef, undef, undef, 0.2);
250 $reaped = waitpid($targ, WNOHANG
);
251 return undef if $reaped < 0;
252 return $?
if $reaped == $targ;
# Grace period expired: escalate to SIGKILL.
255 $count = kill(9*$pg, $targ); # SIGKILL is 9
256 $reaped = waitpid($targ, WNOHANG
);
257 return undef if $reaped < 0;
258 return $?
if $reaped == $targ;
259 # We should not need to wait to reap a SIGKILL, however, just in case
260 # the system doesn't make a SIGKILL'd process immediately reapable
261 # (perhaps under extremely heavy load) we accommodate a brief delay
262 while ($count && time() < $harsh) {
263 select(undef, undef, undef, 0.2);
264 $reaped = waitpid($targ, WNOHANG
);
265 return undef if $reaped < 0;
266 return $?
if $reaped == $targ;
271 sub handle_softexit
{
# Soft ^C handler: let running jobs finish; a second ^C exits immediately.
272 error
("Waiting for outstanding jobs to finish... ".
"^C again to exit immediately");
276 $SIG{'INT'} = \
&handle_exit
;
# NOTE(review): lines below belong to handle_exit (its opener is missing
# from this extraction): kill outstanding jobs and drop the lockfile.
280 error
("Killing outstanding jobs, please be patient...");
281 $SIG{'TERM'} = 'IGNORE';
283 kill_gently
($_->{'pid'}, 1);
285 unlink $lockfile if ($locked);
# NOTE(review): fragments of queue_job/run_job/_job_name (sub openers and
# several original lines missing from this extraction).
291 $opts{'queued_at'} = time;
292 $opts{'dont_run'} = 0;
# Jobs are non-intensive unless the caller says otherwise.
293 $opts{'intensive'} = 0 unless exists $opts{'intensive'};
301 $job->{'command'}->($job);
302 if ($job->{'dont_run'}) {
# _job_name: human-readable "[type::project]" label used in log messages.
311 "[".$job->{'type'}."::".$job->{'project'}."]";
314 # Only one of those per job!
315 sub exec_job_command
{
316 my ($job, $command, $err_only) = @_;
319 $job->{'finished'} = 0;
320 delete $job->{'pid'};
321 if (!defined($pid = fork)) {
322 error
(_job_name
($job) ." Can't fork job: $!");
323 $job->{'finished'} = 1;
# Child-side setup below: detach stdio and start a new process group.
328 select(undef, undef, undef, 0.1);
# NOTE(review): "'/dev/null' || do" is a precedence bug — || binds to the
# (always true) string, so the error block can never run; should be
# "or do".  Not fixed here because the surrounding lines are missing.
330 open STDIN
, '<', '/dev/null' || do {
331 error
(_job_name
($job) ."Can't read from /dev/null: $!");
335 open STDOUT
, '>', '/dev/null' || do {
336 error
(_job_name
($job) ." Can't write to /dev/null: $!");
340 # New process group so we can keep track of all of its children
341 if (!defined(POSIX
::setpgid
(0, 0))) {
342 error
(_job_name
($job) ." Can't create process group: $!");
347 # Stop perl from complaining
# Parent side: record the child's pid and start time for supervision.
350 $job->{'pid'} = $pid;
351 $job->{'started_at'} = time;
# NOTE(review): lines below belong to job_skip($job, $msg): mark the job
# not-run and log the reason unless quiet or no message was given.
355 my ($job, $msg) = @_;
356 $job->{'dont_run'} = 1;
357 error
(_job_name
($job) ." Skipping job: $msg") unless $quiet || !$msg;
360 sub reap_hanging_jobs
{
# Kill any running job that has exceeded its (possibly scaled) timeout.
# NOTE(review): the loop header over @running and the closing lines are
# missing from this extraction; $_ below is one running job hashref.
362 my $factor = $_->{'timeout_factor'} || 1;
363 if (defined($_->{'started_at'}) && (time - $_->{'started_at'}) > ($kill_after * $factor)) {
364 $_->{'finished'} = 1;
365 my $exitcode = kill_gently
($_->{'pid'}, 1);
368 error
(_job_name
($_) ." KILLED due to timeout" .
369 (($exitcode & 0x7f) == 9 ?
" with SIGKILL": ""));
370 push @jobs_killed, _job_name
($_);
# NOTE(review): fragments of reap_one_job and reap_finished_jobs (some
# original lines are missing from this extraction).
# reap_one_job: fire on_success for a cleanly finished job, on_error
# otherwise, then mark it finished.
377 if (!$job->{'finished'}) {
378 $job->{'on_success'}->($job) if defined($job->{'on_success'});
379 $job->{'finished'} = 1;
382 $job->{'on_error'}->($job) if defined($job->{'on_error'});
386 sub reap_finished_jobs
{
388 my $finished_any = 0;
# First finish off any jobs already killed by reap_hanging_jobs.
389 foreach my $child (grep { !$_->{'pid'} && $_->{'killed'} } @running) {
390 delete $child->{'killed'};
391 reap_one_job
($child);
# Then reap any child process that has exited.
395 $pid = waitpid(-1, WNOHANG
);
399 my @child = grep { $_->{'pid'} && $_->{'pid'} == $pid } @running;
401 # any non-zero exit status should trigger on_error
402 $child[0]->{'finished'} = 1 if @child;
405 delete $child[0]->{'pid'};
406 reap_one_job
($child[0]);
# Drop completed jobs from the running list.
409 @running = grep { $_->{'finished'} == 0 } @running;
# Number of currently running jobs flagged as resource-intensive
# (grep in the caller's scalar context yields the match count).
# NOTE(review): only the closing brace was missing from the extraction.
sub have_intensive_jobs {
	return grep { $_->{'intensive'} == 1 } @running;
}
# NOTE(review): body of ts() — timestamp prefix for log lines (sub opener
# missing from this extraction).
418 "[". scalar(localtime) ."] ";
# NOTE(review): body of get_load_info() — returns the three load averages
# or undef when they cannot be determined (sub opener missing).
422 if ($^O
eq "linux") {
423 # Read /proc/loadavg on Linux
424 open(LOADAV
, '<', '/proc/loadavg') or return undef;
425 my $loadinfo = <LOADAV
>;
427 return (split(/\s/, $loadinfo, 4))[0..2];
429 # Read the output of uptime everywhere else (works on Linux too)
430 open(LOADAV
, '-|', 'uptime') or return undef;
431 my $loadinfo = <LOADAV
>;
433 $loadinfo =~ /load average[^0-9.]*([0-9.]+)[^0-9.]+([0-9.]+)[^0-9.]+([0-9.]+)/iso or return undef;
# NOTE(review): interior of run_queue — the main supervision loop (sub
# opener and several original lines missing from this extraction).
439 my $last_progress = time;
440 my $last_checkload = time - 5;
441 my $current_load = $load_trig;
448 my $s = @queue == 1 ?
'' : 's';
449 ferror
("--- Processing %d queued job$s", scalar(@queue));
# Install signal handlers for graceful (INT) and immediate (TERM) exit.
451 $SIG{'INT'} = \
&handle_softexit
;
452 $SIG{'TERM'} = \
&handle_exit
;
453 while (@queue || @running) {
455 my $proceed_immediately = reap_finished_jobs
();
456 # Check current system load
457 if ($load_trig && (time - $last_checkload) >= 5 && defined((my @loadinfo = get_load_info
())[0])) {
# NOTE(review): this inner "my" shadows the $current_load declared at
# original line 441 — looks intentional within this block, but confirm.
458 my $current_load = $loadinfo[0];
459 if ($current_load > $load_trig && !$overloaded) {
461 error
("PAUSE: system load is at $current_load > $load_trig") if $progress;
462 } elsif ($current_load < $load_untrig && $overloaded) {
464 error
("RESUME: system load is at $current_load < $load_untrig") if $progress;
467 $load_info = ', paused (load '. $current_load .')';
469 $load_info = ', load '. $current_load;
471 $last_checkload = time;
# Emit a status line at most once a minute when --progress is on.
474 if ($progress && (time - $last_progress) >= 60) {
475 ferror
("STATUS: %d queued, %d running, %d finished, %d skipped, %d killed$load_info", scalar(@queue), scalar(@running), $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
479 push @run_status, _job_name
($_)." ". (time - $_->{'started_at'}) ."s";
481 error
("STATUS: currently running: ". join(', ', @run_status));
483 $last_progress = time;
485 # Back off if we're too busy
486 if (@running >= $max_par || have_intensive_jobs
() >= $max_par_intensive || !@queue || $overloaded) {
487 sleep 1 unless $proceed_immediately;
491 run_job
(shift(@queue)) if @queue;
494 my $s = $jobs_executed == 1 ?
'' : 's';
495 ferror
("--- Queue processed. %d job$s executed, %d skipped, %d killed.", $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
499 sub run_perpetually
{
# NOTE(review): many original lines (lockfile creation details, the
# queue-run call, loop braces) are missing from this extraction.
501 die "Lockfile '$lockfile' exists. Please make sure no other instance of jobd is running.\n";
# NOTE(review): "$lockfile || die" is a precedence bug — || binds to
# $lockfile (always true), so a failed open is never diagnosed; both
# opens below should use "or die".  Not fixed: surrounding lines missing.
503 open LOCK
, '>', $lockfile || die "Cannot create lockfile '$lockfile': $!\n";
510 # touch ctime of lockfile to prevent it from being removed by /tmp cleaning
511 chmod 0640, $lockfile;
512 chmod 0644, $lockfile;
513 # check for restart request
514 open LOCK
, '<', $lockfile || die "Lock file '$lockfile' has disappeared!\n";
515 my $request = <LOCK
>;
517 chomp $request if defined($request);
518 if (defined($request) && $request eq "restart") {
524 sleep($restart_delay) if $perpetual; # Let the system breathe for a moment
531 ######### Helpers {{{1
# NOTE(review): bodies of error() and ferror() (sub openers missing from
# this extraction): error() prints a timestamped line to STDERR;
# ferror() is its sprintf-style wrapper.
534 print STDERR ts
().shift()."\n";
537 error
(sprintf($_[0], @_[1..$#_]));
# ---- Main program (fragmentary: many original lines are missing from
# ---- this extraction; visible code preserved byte-for-byte) ----
546 my $reexec = Girocco
::ExecUtil
->new;
547 my $realpath0 = realpath
($0);
# Close DATA (if open) so a graceful re-exec does not leak the handle.
549 close(DATA
) if fileno(DATA
);
551 Getopt
::Long
::Configure
('bundling');
553 my $parse_res = GetOptions
(
555 pod2usage
(-verbose
=> 2, -exitval
=> 0, -input
=> $realpath0)},
556 'quiet|q' => \
$quiet,
557 'progress|P' => \
$progress,
558 'kill-after|k=i' => \
$kill_after,
559 'max-parallel|p=i' => \
$max_par,
560 'max-intensive-parallel|i=i' => \
$max_par_intensive,
561 'load-triggers=s' => \
$load_triggers,
562 'restart-delay|d=i' => \
$restart_delay,
563 'lockfile|l=s' => \
$lockfile,
564 'same-pid' => \
$same_pid,
565 'all-once|a' => \
$all_once,
566 'one|o=s' => sub {$one_once{$_[1]} = 1, push(@one, $_[1])
567 unless exists $one_once{$_[1]}},
568 'update-only' => \
$update_only,
569 'gc-only' => \
$gc_only,
570 'needs-gc-only' => \
$needs_gc_only,
571 ) || pod2usage
(-exitval
=> 2, -input
=> $realpath0);
# Option sanity checks: the *-only flags are mutually exclusive and only
# meaningful together with --all-once or --one.
572 fatal
("Error: can only use one out of --all-once and --one")
573 if $all_once && @one;
574 my $onlycnt = ($update_only?
1:0) + ($gc_only?
1:0) + ($needs_gc_only?
1:0);
575 fatal
("Error: can only use one out of --update-only, --gc-only and --needs-gc-only")
577 fatal
("Error: --update-only, --gc-only or --needs-gc-only requires --all-once or --one")
578 if $onlycnt && !($all_once || @one);
581 $ENV{'show_progress'} = '1';
# Disable load-based throttling when load info is unavailable.
585 $load_triggers = '0,0' unless defined((get_load_info
())[0]);
586 ($load_trig, $load_untrig) = split(/,/, $load_triggers);
589 queue_one
($_) foreach @one;
# Perpetual mode: re-exec ourselves when the lockfile requests "restart".
601 if (run_perpetually
() eq "restart") {
602 error
("Restarting in response to restart request... ");
603 $reexec->reexec($same_pid);
604 error
("Continuing after failed restart: $!");
610 ########## Documentation {{{1
616 jobd.pl - Perform Girocco maintenance jobs
623 -h | --help detailed instructions
624 -q | --quiet run quietly
625 -P | --progress show occasional status updates
626 -k SECONDS | --kill-after SECONDS how long to wait before killing jobs
627 -p NUM | --max-parallel NUM how many jobs to run at the same time
628 -i NUM | --max-intensive-parallel NUM how many resource-hungry jobs to run
630 --load-triggers TRIG,UNTRIG stop queueing jobs at load above
631 TRIG and resume at load below UNTRIG
632 -d NUM | --restart-delay SECONDS wait for this many seconds between
634 -l FILE | --lockfile FILE create a lockfile in the given
636 --same-pid keep same pid during graceful restart
637 -a | --all-once process the list only once
638 -o PRJNAME | --one PRJNAME process only one project
639 --update-only process mirror updates only
640 --gc-only perform needed garbage collection only
641 --needs-gc-only perform needed mini gc only
649 Print the full description of jobd.pl's options.
653 Suppress non-error messages, e.g. for use when running this task as a cronjob.
657 Show information about the current status of the job queue occasionally. This
658 is automatically enabled if --quiet is not given.
660 =item B<--kill-after SECONDS>
662 Kill supervised jobs after a certain time to avoid hanging the daemon.
664 =item B<--max-parallel NUM>
666 Run no more than that many jobs at the same time. The default is the number
667 of cpus * 2. If the number of cpus cannot be determined, the default is 8.
669 =item B<--max-intensive-parallel NUM>
671 Run no more than that many resource-hungry jobs at the same time. Right now,
672 this refers to repacking jobs. The default is 1.
674 =item B<--load-triggers TRIG,UNTRIG>
676 If the first system load average (1 minute average) exceeds TRIG, don't queue
677 any more jobs until it goes below UNTRIG. This is currently only supported on
678 Linux and any other platforms that provide an uptime command with load average
681 If both values are zero, load checks are disabled. The default is the number
682 of cpus * 1.5 for TRIG and half that for UNTRIG. If the number of cpus cannot
683 be determined, the default is 6,3.
685 =item B<--restart-delay NUM>
687 After processing the queue, wait this many seconds until the queue is
688 restarted. The default is 300 seconds.
690 =item B<--lockfile FILE>
692 For perpetual operation, specify the full path to a lock file to create and
693 then remove after finishing/aborting. The default is /tmp/jobd-$suffix.lock
694 where $suffix is a 6-character string uniquely determined by the name and
695 nickname of this Girocco instance. The pid of the running jobd instance will
696 be written to the lock file.
700 When performing a graceful restart, keep the same pid rather than switching to
705 Instead of perpetually processing all projects over and over again, process
706 them just once and then exit.
707 Conflicts with B<--one PRJNAME> option.
709 =item B<--one PRJNAME>
711 Process only the given project (given as just the project name without C<.git>
712 suffix) and then exit. May be repeated to process more than one project.
713 Conflicts with B<--all-once> option.
715 =item B<--update-only>
717 Limit processing to only those projects that need a mirror update.
718 Behaves as though every project has a C<.nogc> file present in it.
719 Requires use of B<--all-once> or B<--one PRJNAME> option.
720 Conflicts with B<--gc-only> and B<--needs-gc-only> options.
724 Limit processing to only those projects that need to have garbage collection
725 run on them. Behaves as though every project has a C<.bypass_fetch> file
726 present in it. Requires use of B<--all-once> or B<--one PRJNAME> option.
727 Conflicts with B<--update-only> and B<--needs-gc-only> options.
729 =item B<--needs-gc-only>
731 Limit processing to only those projects that need to have mini garbage
732 collection run on them. Behaves as though every project with a C<.needsgc>
733 file present in it also has a C<.bypass_fetch> file present in it and as though
734 every project without a C<.needsgc> file present in it has a C<.bypass> file
735 present in it. Requires use of B<--all-once> or B<--one PRJNAME> option.
736 Conflicts with B<--update-only> and B<--gc-only> options.
742 jobd.pl is Girocco's repositories maintenance servant; it periodically checks
743 all the repositories and updates mirrored repositories and repacks push-mode
744 repositories when needed.