3 # jobd - perform Girocco maintenance jobs
5 # Run with --help for details
12 use POSIX
":sys_wait_h";
15 use lib
"__BASEDIR__";
20 BEGIN {noFatalsToBrowser
}
21 use Girocco
::ExecUtil
;
25 return 4 unless defined($cpus) && $cpus ne "" && int($cpus) >= 1;
26 return int($cpus * 2) if $cpus <= 2;
27 return 5 if $cpus < 4;
28 return int($cpus * 1.5) if $cpus <= 10;
35 my $cpus = online_cpus
;
37 my $max_par = $cpus ? _defjobs
($cpus) : 4;
38 my $max_par_intensive = 1;
39 my $load_triggers = $cpus ?
sprintf("%g,%g", $cpus * 1.5, $cpus * 0.75) : "6,3";
40 my $lockfile = "/tmp/jobd-$Girocco::Config::tmpsuffix.lock";
41 my $restart_delay = 300;
45 my ($update_only, $gc_only, $needs_gc_only);
47 my ($load_trig, $load_untrig);
53 my $p = $job->{'project'};
54 check_project_exists
($job) || return;
55 my $projpath = get_project_path
($p);
56 if ($gc_only || $needs_gc_only ||
57 -e
"$projpath/.nofetch" ||
58 -e
"$projpath/.bypass" ||
59 -e
"$projpath/.bypass_fetch" ||
60 is_mirror_disabled
($p)) {
62 setup_gc
($job) unless ! -e
"$projpath/.nofetch" &&
63 -e
"$projpath/.clone_in_progress" && ! -e
"$projpath/.clone_failed";
66 if (-e
"$projpath/.clone_in_progress" && ! -e
"$projpath/.clone_failed") {
67 job_skip
($job, "initial mirroring not complete yet");
70 if (-e
"$projpath/.clone_failed" || -e
"$projpath/.clone_failed_exceeds_limit") {
71 job_skip
($job, "initial mirroring failed");
72 # Still need to gc clones even if they've failed
76 if (my $ts = is_operation_uptodate
($p, 'lastrefresh', rand_adjust
($Girocco::Config
::min_mirror_interval
))) {
77 job_skip
($job, "not needed right now, last run at $ts");
81 if (is_svn_clone
($p)) {
82 # git svn can be very, very slow at times
83 $job->{'timeout_factor'} = 3;
85 exec_job_command
($job, ["$Girocco::Config::basedir/jobd/update.sh", $p], $quiet);
90 my $p = $job->{'project'};
91 check_project_exists
($job) || return;
92 my $projpath = get_project_path
($p);
93 if ($update_only || -e
"$projpath/.nogc" || -e
"$projpath/.bypass" ||
94 (-e
"$projpath/.delaygc" && ! -e
"$projpath/.allowgc" && ! -e
"$projpath/.needsgc")) {
99 if (! -e
"$projpath/.needsgc" && ($needs_gc_only ||
100 ($ts = is_operation_uptodate
($p, 'lastgc', rand_adjust
($Girocco::Config
::min_gc_interval
))))) {
101 job_skip
($job, ($needs_gc_only ?
undef : "not needed right now, last run at $ts"));
104 # allow garbage collection to run for longer than an update
105 $job->{'lastgc'} = get_git_config
($projpath, "gitweb.lastgc");
106 $job->{'timeout_factor'} = 2;
107 exec_job_command
($job, ["$Girocco::Config::basedir/jobd/gc.sh", $p], $quiet);
113 project
=> $job->{'project'},
115 command
=> \
&gc_project
,
117 on_success
=> \
&maybe_setup_gc_again
,
121 sub maybe_setup_gc_again
{
123 # If lastgc was set before gc.sh ran successfully and it is no longer set
124 # afterwards, then queue up another run of gc.sh for the project.
125 # However, just in case, no matter what happens with the extra
126 # gc.sh run no more "bonus" runs are possible to avoid any loops.
127 # This allows a "mini" gc that triggers a full gc to have the
128 # full gc run as part of the same --all-once run through instead
129 # of waiting. A very good thing for users of the --all-once option.
130 if ($job->{'lastgc'}) {
131 my $projpath = get_project_path
($job->{'project'});
132 get_git_config
($projpath, "gitweb.lastgc") or
134 project
=> $job->{'project'},
136 command
=> \
&gc_project
,
# Guard used before running a job: when the directory returned by
# get_project_path for the job's project is not present, the job is marked
# as skipped via job_skip with the reason "non-existent project".
# NOTE(review): the return statements are not visible in this excerpt; callers
# use "check_project_exists($job) || return", so presumably it yields a true
# value only when the project directory exists -- confirm.
142 sub check_project_exists
{
144 my $p = $job->{'project'};
145 if (! -d get_project_path
($p)) {
146 job_skip
($job, "non-existent project");
# Builds the repository directory path for a project.
# The first argument (the project name, without the ".git" suffix) is turned
# into "$Girocco::Config::reporoot/<name>.git".
152 sub get_project_path
{
153 "$Girocco::Config::reporoot/".shift().".git";
156 my $_last_config_path;
160 $_last_config_path = "";
161 $_last_config_id = "";
166 my ($projdir, $name) = @_;
167 defined($projdir) && -d
$projdir && -f
"$projdir/config" or return undef;
168 my $cf = "$projdir/config";
169 my @stat = stat($cf);
170 @stat && $stat[7] && $stat[9] or return undef;
171 my $id = join(":", $stat[0], $stat[1], $stat[7], $stat[9]); # dev,ino,size,mtime
172 if ($_last_config_path ne $cf || $_last_config_id ne $id || ref($_last_config) ne 'HASH') {
173 my $data = read_config_file_hash
($cf);
174 defined($data) or $data = {};
175 $_last_config_path = $_last_config_id = "";
176 $_last_config = $data;
177 $_last_config_id = $id;
178 $_last_config_path = $cf;
180 return $_last_config->{$name};
# Checks whether the operation whose timestamp is stored under the git config
# key "gitweb.$which" of $project was run recently enough.
#   $project   - project name (resolved with get_project_path)
#   $which     - suffix of the config key, e.g. 'lastrefresh' or 'lastgc'
#   $threshold - maximum allowed age, compared against a difference of time()
#                values (i.e. seconds)
# Evaluates to the stored timestamp string when its age is <= $threshold,
# otherwise to undef.
183 sub is_operation_uptodate
{
184 my ($project, $which, $threshold) = @_;
185 my $path = get_project_path
($project);
186 my $timestamp = get_git_config
($path, "gitweb.$which");
# A missing config value is treated as an empty timestamp string
187 defined($timestamp) or $timestamp = '';
# A false result from parse_rfc2822_date is treated as time 0, which makes
# the operation look maximally out of date
188 my $unix_ts = parse_rfc2822_date
($timestamp) || 0;
189 (time - $unix_ts) <= $threshold ?
$timestamp : undef;
# Reports whether mirroring is disabled for a project, judged solely from the
# project's "gitweb.baseurl" git config value.
# NOTE(review): the line that assigns $project is not visible in this excerpt;
# presumably it is taken from @_ -- confirm.
# Returns true when the trimmed base URL is empty, contains whitespace, or
# starts with the word "disabled" (matched case-insensitively).
192 sub is_mirror_disabled
{
194 my $path = get_project_path
($project);
195 my $baseurl = get_git_config
($path, 'gitweb.baseurl');
# An unset value is handled the same as an empty one
196 defined($baseurl) or $baseurl = '';
# Strip leading and trailing whitespace before testing
197 $baseurl =~ s/^\s+//;
198 $baseurl =~ s/\s+$//;
199 return $baseurl eq "" || $baseurl =~ /\s/ || $baseurl =~ /^disabled(?:\s|$)/i;
204 my $path = get_project_path
($project);
205 my $baseurl = get_git_config
($path, 'gitweb.baseurl');
206 defined($baseurl) or $baseurl = '';
207 my $svnurl = get_git_config
($path, 'svn-remote.svn.url');
208 defined($svnurl) or $svnurl = '';
209 return $baseurl =~ /^svn[:+]/i && $svnurl;
217 command
=> \
&update_project
,
218 on_success
=> \
&setup_gc
,
219 on_error
=> \
&setup_gc
,
224 queue_one
($_) for (Girocco
::Project
->get_full_list());
227 ######### Daemon operation {{{1
237 # Kills and reaps the specified pid. Returns exit status ($?) on success
238 # otherwise undef if process could not be killed or reaped
239 # First sends SIGINT and if process does not exit within 15 seconds then SIGKILL
240 # We used to send SIGTERM instead of SIGINT, but by using SIGINT we can take
241 # advantage of "tee -i" in our update scripts and really anything we're killing
242 # should respond the same to either SIGINT or SIGTERM and exit gracefully.
243 # Usage: my $exitcode = kill_gently($pid, $kill_process_group = 0);
246 my $use_pg = shift || 0;
247 # Note that the docs for Perl's kill state that a negative signal
248 # number should be used to kill process groups and that while
249 # a negative process id (and positive signal number) may also do that
250 # on some platforms, that's not portable.
251 my $pg = $use_pg ?
-1 : 1;
252 my $harsh = time() + 15; # SIGKILL after this delay
253 my $count = kill(2*$pg, $targ); # SIGINT is 2
254 my $reaped = waitpid($targ, WNOHANG
);
255 return undef if $reaped < 0;
256 return $?
if $reaped == $targ;
257 while ($count && time() < $harsh) {
258 select(undef, undef, undef, 0.2);
259 $reaped = waitpid($targ, WNOHANG
);
260 return undef if $reaped < 0;
261 return $?
if $reaped == $targ;
264 $count = kill(9*$pg, $targ); # SIGKILL is 9
265 $reaped = waitpid($targ, WNOHANG
);
266 return undef if $reaped < 0;
267 return $?
if $reaped == $targ;
268 # We should not need to wait to reap a SIGKILL, however, just in case
269 # the system doesn't make a SIGKILL'd process immediately reapable
270 # (perhaps under extremely heavy load) we accommodate a brief delay
271 while ($count && time() < $harsh) {
272 select(undef, undef, undef, 0.2);
273 $reaped = waitpid($targ, WNOHANG
);
274 return undef if $reaped < 0;
275 return $?
if $reaped == $targ;
280 sub handle_softexit
{
281 error
("Waiting for outstanding jobs to finish... ".
282 "^C again to exit immediately");
285 $SIG{'INT'} = \
&handle_exit
;
289 error
("Killing outstanding jobs, please be patient...");
290 $SIG{'TERM'} = 'IGNORE';
292 kill_gently
($_->{'pid'}, 1);
294 unlink $lockfile if ($locked);
300 $opts{'queued_at'} = time;
301 $opts{'dont_run'} = 0;
302 $opts{'intensive'} = 0 unless exists $opts{'intensive'};
310 $job->{'command'}->($job);
311 if ($job->{'dont_run'}) {
320 "[".$job->{'type'}."::".$job->{'project'}."]";
323 # Only one of those per job!
324 sub exec_job_command
{
325 my ($job, $command, $err_only) = @_;
328 $job->{'finished'} = 0;
329 delete $job->{'pid'};
330 if (!defined($pid = fork)) {
331 error
(_job_name
($job) ." Can't fork job: $!");
332 $job->{'finished'} = 1;
337 select(undef, undef, undef, 0.1);
# Redirect the child's standard streams to /dev/null.
# The open() calls are parenthesized so that "|| do { ... }" tests the result
# of open itself.  Without the parentheses "||" binds to the '/dev/null'
# string, which is always true, so the failure handler could never run and a
# failed open went unnoticed.
339 open(STDIN, '<', '/dev/null') || do {
340 error(_job_name($job) ." Can't read from /dev/null: $!");
344 open(STDOUT, '>', '/dev/null') || do {
345 error(_job_name($job) ." Can't write to /dev/null: $!");
349 # New process group so we can keep track of all of its children
350 if (!defined(POSIX
::setpgid
(0, 0))) {
351 error
(_job_name
($job) ." Can't create process group: $!");
356 # Stop perl from complaining
359 $job->{'pid'} = $pid;
360 $job->{'started_at'} = time;
364 my ($job, $msg) = @_;
365 $job->{'dont_run'} = 1;
366 error
(_job_name
($job) ." Skipping job: $msg") unless $quiet || !$msg;
369 sub reap_hanging_jobs
{
371 my $factor = $_->{'timeout_factor'} || 1;
372 if (defined($_->{'started_at'}) && (time - $_->{'started_at'}) > ($kill_after * $factor)) {
373 $_->{'finished'} = 1;
374 my $exitcode = kill_gently
($_->{'pid'}, 1);
377 error
(_job_name
($_) ." KILLED due to timeout" .
378 (($exitcode & 0x7f) == 9 ?
" with SIGKILL": ""));
379 push @jobs_killed, _job_name
($_);
386 if (!$job->{'finished'}) {
387 $job->{'on_success'}->($job) if defined($job->{'on_success'});
388 $job->{'finished'} = 1;
391 $job->{'on_error'}->($job) if defined($job->{'on_error'});
395 sub reap_finished_jobs
{
397 my $finished_any = 0;
398 foreach my $child (grep { !$_->{'pid'} && $_->{'killed'} } @running) {
399 delete $child->{'killed'};
400 reap_one_job
($child);
404 $pid = waitpid(-1, WNOHANG
);
408 my @child = grep { $_->{'pid'} && $_->{'pid'} == $pid } @running;
410 # any non-zero exit status should trigger on_error
411 $child[0]->{'finished'} = 1 if @child;
414 delete $child[0]->{'pid'};
415 reap_one_job
($child[0]);
418 @running = grep { $_->{'finished'} == 0 } @running;
# Selects the entries of @running whose 'intensive' field equals 1.
# Because the grep is the value of the sub, a caller in scalar (numeric)
# context gets the number of such jobs; the queue loop compares that count
# against $max_par_intensive.
422 sub have_intensive_jobs
{
423 grep { $_->{'intensive'} == 1 } @running;
427 "[". scalar(localtime) ."] ";
431 my $loadinfo = undef;
432 if ($^O
eq "linux") {
433 # Read /proc/loadavg on Linux
434 open(LOADAV
, '<', '/proc/loadavg') or return undef;
438 $loadinfo = 'load average '.join(" ",(split(/\s+/, $info, 4))[0..2]);
440 # Read the output of uptime everywhere else (works on Linux too)
441 open(LOADAV
, '-|', 'uptime') or return undef;
442 $loadinfo = <LOADAV
>;
445 defined($loadinfo) &&
446 $loadinfo =~ /load average[^0-9.]*([0-9.]+)[^0-9.]+([0-9.]+)[^0-9.]+([0-9.]+)/iso or return undef;
447 return (0.0+$1, 0.0+$2, 0.0+$3);
452 my $last_progress = $start;
453 my $last_checkload = $start - 5;
454 my $current_load = $load_trig;
461 my $s = @queue == 1 ?
'' : 's';
462 ferror
("--- Processing %d queued job$s", scalar(@queue));
464 $SIG{'INT'} = \
&handle_softexit
;
465 $SIG{'TERM'} = \
&handle_exit
;
466 while (@queue || @running) {
468 my $proceed_immediately = reap_finished_jobs
();
469 # Check current system load
470 if ($load_trig && (time - $last_checkload) >= 5 && defined((my @loadinfo = get_load_info
())[0])) {
471 my $current_load = $loadinfo[0];
472 if ($current_load > $load_trig && !$overloaded) {
474 error
("PAUSE: system load is at $current_load > $load_trig") if $progress;
475 } elsif ($current_load < $load_untrig && $overloaded) {
477 error
("RESUME: system load is at $current_load < $load_untrig") if $progress;
480 $load_info = ', paused (load '. $current_load .')';
482 $load_info = ', load '. $current_load;
484 $last_checkload = time;
487 if ($progress && (time - $last_progress) >= 60) {
488 ferror
("STATUS: %d queued, %d running, %d finished, %d skipped, %d killed$load_info", scalar(@queue), scalar(@running), $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
492 push @run_status, _job_name
($_)." ". (time - $_->{'started_at'}) ."s";
494 error
("STATUS: currently running: ". join(', ', @run_status));
496 $last_progress = time;
498 # Back off if we're too busy
499 if (@running >= $max_par || have_intensive_jobs
() >= $max_par_intensive || !@queue || $overloaded) {
500 sleep 1 unless $proceed_immediately;
504 run_job
(shift(@queue)) if @queue;
507 my $s = $jobs_executed == 1 ?
'' : 's';
508 ferror
("--- Queue processed in %s. %d job$s executed, %d skipped, %d killed.",
509 human_duration
(time - $start), $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
513 sub run_perpetually
{
515 die "Lockfile '$lockfile' exists. Please make sure no other instance of jobd is running.\n";
# open() is parenthesized so that "or die" applies to the result of open.
# Previously "|| die" bound to $lockfile (a non-empty, true string), so a
# failure to create the lock file was silently ignored.
517 open(LOCK, '>', $lockfile) or die "Cannot create lockfile '$lockfile': $!\n";
524 # touch ctime of lockfile to prevent it from being removed by /tmp cleaning
525 chmod 0640, $lockfile;
526 chmod 0644, $lockfile;
527 # check for restart request
# Same precedence fix as above: a vanished lock file is now actually reported
# instead of being read as "no request".
528 open(LOCK, '<', $lockfile) or die "Lock file '$lockfile' has disappeared!\n";
529 my $request = <LOCK
>;
531 chomp $request if defined($request);
532 if (defined($request) && $request eq "restart") {
538 sleep($restart_delay) if $perpetual; # Let the system breathe for a moment
545 ######### Helpers {{{1
548 print STDERR ts
().shift()."\n";
551 error
(sprintf($_[0], @_[1..$#_]));
560 my $reexec = Girocco
::ExecUtil
->new;
561 my $realpath0 = realpath
($0);
563 close(DATA
) if fileno(DATA
);
565 Getopt
::Long
::Configure
('bundling');
567 my $parse_res = GetOptions
(
569 pod2usage
(-verbose
=> 2, -exitval
=> 0, -input
=> $realpath0)},
570 'quiet|q' => sub {++$quiet},
571 'progress|P' => sub {++$progress},
572 'kill-after|k=i' => \
$kill_after,
573 'max-parallel|p=i' => \
$max_par,
574 'max-intensive-parallel|i=i' => \
$max_par_intensive,
575 'load-triggers=s' => \
$load_triggers,
576 'restart-delay|d=i' => \
$restart_delay,
577 'lockfile|l=s' => \
$lockfile,
578 'same-pid' => \
$same_pid,
579 'all-once|a' => \
$all_once,
580 'one|o=s' => sub {$one_once{$_[1]} = 1, push(@one, $_[1])
581 unless exists $one_once{$_[1]}},
582 'update-only' => \
$update_only,
583 'gc-only' => \
$gc_only,
584 'needs-gc-only' => \
$needs_gc_only,
585 ) || pod2usage
(-exitval
=> 2, -input
=> $realpath0);
586 fatal
("Error: can only use one out of --all-once and --one")
587 if $all_once && @one;
588 my $onlycnt = ($update_only?
1:0) + ($gc_only?
1:0) + ($needs_gc_only?
1:0);
589 fatal
("Error: can only use one out of --update-only, --gc-only and --needs-gc-only")
591 fatal
("Error: --update-only, --gc-only or --needs-gc-only requires --all-once or --one")
592 if $onlycnt && !($all_once || @one);
594 delete $ENV{'show_progress'};
596 $ENV{'show_progress'} = 0 if $quiet > 1;
598 $progress = 1 unless $progress;
599 $ENV{'show_progress'} = $progress;
602 $load_triggers = '0,0' unless defined((get_load_info
())[0]);
603 ($load_trig, $load_untrig) = split(/,/, $load_triggers);
606 queue_one
($_) foreach @one;
618 if (run_perpetually
() eq "restart") {
619 error
("Restarting in response to restart request... ");
620 $reexec->reexec($same_pid);
621 error
("Continuing after failed restart: $!");
627 ########## Documentation {{{1
633 jobd.pl - Perform Girocco maintenance jobs
640 -h | --help detailed instructions
641 -q | --quiet run quietly
642 -P | --progress show occasional status updates
643 -k SECONDS | --kill-after SECONDS how long to wait before killing jobs
644 -p NUM | --max-parallel NUM how many jobs to run at the same time
645 -i NUM | --max-intensive-parallel NUM how many resource-hungry jobs to run
647 --load-triggers TRIG,UNTRIG stop queueing jobs at load above
648 TRIG and resume at load below UNTRIG
649 -d SECONDS | --restart-delay SECONDS wait for this many seconds between
651 -l FILE | --lockfile FILE create a lockfile in the given
653 --same-pid keep same pid during graceful restart
654 -a | --all-once process the list only once
655 -o PRJNAME | --one PRJNAME process only one project
656 --update-only process mirror updates only
657 --gc-only perform needed garbage collection only
658 --needs-gc-only perform needed mini gc only
666 Print the full description of jobd.pl's options.
670 Suppress non-error messages, e.g. for use when running this task as a cronjob.
671 When given two or more times suppress update ref change lines in logs as well.
675 Show information about the current status of the job queue occasionally. This
676 is automatically enabled if --quiet is not given. When specified two or more
677 times full ref change details will be shown for updates.
679 =item B<--kill-after SECONDS>
681 Kill supervised jobs after a certain time to avoid hanging the daemon.
683 =item B<--max-parallel NUM>
685 Run no more than that many jobs at the same time. The default is the number
686 of cpus * 2 for 1 or 2 cpus, 5 for 3 cpus and int(cpus * 1.5) for 4 cpus or
687 more with the default capped to 16 when more than 10 cpus are detected.
688 If the number of cpus cannot be determined, the default is 4.
690 =item B<--max-intensive-parallel NUM>
692 Run no more than that many resource-hungry jobs at the same time. Right now,
693 this refers to repacking jobs. The default is 1.
695 =item B<--load-triggers TRIG,UNTRIG>
697 If the first system load average (1 minute average) exceeds TRIG, don't queue
698 any more jobs until it goes below UNTRIG. This is currently only supported on
699 Linux and any other platforms that provide an uptime command with load average
702 If both values are zero, load checks are disabled. The default is the number
703 of cpus * 1.5 for TRIG and half that for UNTRIG. If the number of cpus cannot
704 be determined, the default is 6,3.
706 =item B<--restart-delay SECONDS>
708 After processing the queue, wait this many seconds until the queue is
709 restarted. The default is 300 seconds.
711 =item B<--lockfile FILE>
713 For perpetual operation, specify the full path to a lock file to create and
714 then remove after finishing/aborting. The default is /tmp/jobd-$suffix.lock
715 where $suffix is a 6-character string uniquely determined by the name and
716 nickname of this Girocco instance. The pid of the running jobd instance will
717 be written to the lock file.
721 When performing a graceful restart, keep the same pid rather than switching to
726 Instead of perpetually processing all projects over and over again, process
727 them just once and then exit.
728 Conflicts with B<--one PRJNAME> option.
730 =item B<--one PRJNAME>
732 Process only the given project (given as just the project name without C<.git>
733 suffix) and then exit. May be repeated to process more than one project.
734 Conflicts with B<--all-once> option.
736 =item B<--update-only>
738 Limit processing to only those projects that need a mirror update.
739 Behaves as though every project has a C<.nogc> file present in it.
740 Requires use of B<--all-once> or B<--one PRJNAME> option.
741 Conflicts with B<--gc-only> and B<--needs-gc-only> options.
745 Limit processing to only those projects that need to have garbage collection
746 run on them. Behaves as though every project has a C<.bypass_fetch> file
747 present in it. Requires use of B<--all-once> or B<--one PRJNAME> option.
748 Conflicts with B<--update-only> and B<--needs-gc-only> options.
750 =item B<--needs-gc-only>
752 Limit processing to only those projects that need to have mini garbage
753 collection run on them. Behaves as though every project with a C<.needsgc>
754 file present in it also has a C<.bypass_fetch> file present in it and as though
755 every project without a C<.needsgc> file present in it has a C<.bypass> file
756 present in it. Requires use of B<--all-once> or B<--one PRJNAME> option.
757 Conflicts with B<--update-only> and B<--gc-only> options.
763 jobd.pl is Girocco's repositories maintenance servant; it periodically checks
764 all the repositories and updates mirrored repositories and repacks push-mode
765 repositories when needed.