3 # jobd - perform Girocco maintenance jobs
5 # Run with --help for details
12 use POSIX
":sys_wait_h";
15 use lib
"__BASEDIR__";
20 BEGIN {noFatalsToBrowser
}
21 use Girocco
::ExecUtil
;
25 return 4 unless defined($cpus) && $cpus ne "" && int($cpus) >= 1;
26 return int($cpus * 2) if $cpus <= 2;
27 return 5 if $cpus < 4;
28 return int($cpus * 1.5) if $cpus <= 10;
35 my $cpus = online_cpus
;
37 my $max_par = $cpus ? _defjobs
($cpus) : 4;
38 my $max_par_intensive = 1;
39 my $load_triggers = $cpus ?
sprintf("%g,%g", $cpus * 1.5, $cpus * 0.75) : "6,3";
40 my $lockfile = "/tmp/jobd-$Girocco::Config::tmpsuffix.lock";
41 my $restart_delay = 300;
45 my ($update_only, $gc_only, $needs_gc_only);
47 my ($load_trig, $load_untrig);
53 my $p = $job->{'project'};
54 check_project_exists
($job) || return;
55 my $projpath = get_project_path
($p);
56 if ($gc_only || $needs_gc_only ||
57 -e
"$projpath/.nofetch" ||
58 -e
"$projpath/.bypass" ||
59 -e
"$projpath/.bypass_fetch" ||
60 is_mirror_disabled
($p)) {
62 setup_gc
($job) unless ! -e
"$projpath/.nofetch" &&
63 -e
"$projpath/.clone_in_progress" && ! -e
"$projpath/.clone_failed";
66 if (-e
"$projpath/.clone_in_progress" && ! -e
"$projpath/.clone_failed") {
67 job_skip
($job, "initial mirroring not complete yet");
70 if (-e
"$projpath/.clone_failed") {
71 job_skip
($job, "initial mirroring failed");
72 # Still need to gc clones even if they've failed
76 if (my $ts = is_operation_uptodate
($p, 'lastrefresh', rand_adjust
($Girocco::Config
::min_mirror_interval
))) {
77 job_skip
($job, "not needed right now, last run at $ts");
81 if (is_svn_clone
($p)) {
82 # git svn can be very, very slow at times
83 $job->{'timeout_factor'} = 3;
85 exec_job_command
($job, ["$Girocco::Config::basedir/jobd/update.sh", $p], $quiet);
90 my $p = $job->{'project'};
91 check_project_exists
($job) || return;
92 my $projpath = get_project_path
($p);
93 if ($update_only || -e
"$projpath/.nogc" || -e
"$projpath/.bypass" ||
94 (-e
"$projpath/.delaygc" && ! -e
"$projpath/.allowgc" && ! -e
"$projpath/.needsgc")) {
99 if (! -e
"$projpath/.needsgc" && ($needs_gc_only ||
100 ($ts = is_operation_uptodate
($p, 'lastgc', rand_adjust
($Girocco::Config
::min_gc_interval
))))) {
101 job_skip
($job, ($needs_gc_only ?
undef : "not needed right now, last run at $ts"));
104 # allow garbage collection to run for longer than an update
105 $job->{'lastgc'} = get_git_config
($projpath, "gitweb.lastgc");
106 $job->{'timeout_factor'} = 2;
107 exec_job_command
($job, ["$Girocco::Config::basedir/jobd/gc.sh", $p], $quiet);
113 project
=> $job->{'project'},
115 command
=> \
&gc_project
,
117 on_success
=> \
&maybe_setup_gc_again
,
121 sub maybe_setup_gc_again
{
123 # If lastgc was set before gc.sh ran successfully and is no longer set now,
124 # then queue up another run of gc.sh for the project.
125 # However, just in case, no matter what happens with the extra
126 # gc.sh run no more "bonus" runs are possible to avoid any loops.
127 # This allows a "mini" gc that triggers a full gc to have the
128 # full gc run as part of the same --all-once run through instead
129 # of waiting. A very good thing for users of the --all-once option.
130 if ($job->{'lastgc'}) {
131 my $projpath = get_project_path
($job->{'project'});
132 get_git_config
($projpath, "gitweb.lastgc") or
134 project
=> $job->{'project'},
136 command
=> \
&gc_project
,
142 sub check_project_exists
{
144 my $p = $job->{'project'};
145 if (! -d get_project_path
($p)) {
146 job_skip
($job, "non-existent project");
152 sub get_project_path
{
153 "$Girocco::Config::reporoot/".shift().".git";
156 my $_last_config_path;
160 $_last_config_path = "";
161 $_last_config_id = "";
166 my ($projdir, $name) = @_;
167 defined($projdir) && -d
$projdir && -f
"$projdir/config" or return undef;
168 my $cf = "$projdir/config";
169 my @stat = stat($cf);
170 @stat && $stat[7] && $stat[9] or return undef;
171 my $id = join(":", $stat[0], $stat[1], $stat[7], $stat[9]); # dev,ino,size,mtime
172 if ($_last_config_path ne $cf || $_last_config_id ne $id || ref($_last_config) ne 'HASH') {
173 my $data = read_config_file_hash
($cf);
174 defined($data) or $data = {};
175 $_last_config_path = $_last_config_id = "";
176 $_last_config = $data;
177 $_last_config_id = $id;
178 $_last_config_path = $cf;
180 return $_last_config->{$name};
183 sub is_operation_uptodate
{
184 my ($project, $which, $threshold) = @_;
185 my $path = get_project_path
($project);
186 my $timestamp = get_git_config
($path, "gitweb.$which");
187 defined($timestamp) or $timestamp = '';
188 my $unix_ts = parse_rfc2822_date
($timestamp) || 0;
189 (time - $unix_ts) <= $threshold ?
$timestamp : undef;
192 sub is_mirror_disabled
{
194 my $path = get_project_path
($project);
195 my $baseurl = get_git_config
($path, 'gitweb.baseurl');
196 defined($baseurl) or $baseurl = '';
197 $baseurl =~ s/^\s+//;
198 $baseurl =~ s/\s+$//;
199 return $baseurl eq "" || $baseurl =~ /\s/ || $baseurl =~ /^disabled(?:\s|$)/i;
204 my $path = get_project_path
($project);
205 my $baseurl = get_git_config
($path, 'gitweb.baseurl');
206 defined($baseurl) or $baseurl = '';
207 my $svnurl = get_git_config
($path, 'svn-remote.svn.url');
208 defined($svnurl) or $svnurl = '';
209 return $baseurl =~ /^svn[:+]/i && $svnurl;
217 command
=> \
&update_project
,
218 on_success
=> \
&setup_gc
,
219 on_error
=> \
&setup_gc
,
224 queue_one
($_) for (Girocco
::Project
->get_full_list());
227 ######### Daemon operation {{{1
237 # Kills and reaps the specified pid. Returns exit status ($?) on success
238 # otherwise undef if process could not be killed or reaped
239 # First sends SIGINT and if process does not exit within 15 seconds then SIGKILL
240 # We used to send SIGTERM instead of SIGINT, but by using SIGINT we can take
241 # advantage of "tee -i" in our update scripts and really anything we're killing
242 # should respond the same to either SIGINT or SIGTERM and exit gracefully.
243 # Usage: my $exitcode = kill_gently($pid, $kill_process_group = 0);
246 my $use_pg = shift || 0;
247 # Note that the docs for Perl's kill state that a negative signal
248 # number should be used to kill process groups and that while a
249 # negative process id (and positive signal number) may also do that
250 # on some platforms, that's not portable.
251 my $pg = $use_pg ?
-1 : 1;
252 my $harsh = time() + 15; # SIGKILL after this delay
253 my $count = kill(2*$pg, $targ); # SIGINT is 2
254 my $reaped = waitpid($targ, WNOHANG
);
255 return undef if $reaped < 0;
256 return $?
if $reaped == $targ;
257 while ($count && time() < $harsh) {
258 select(undef, undef, undef, 0.2);
259 $reaped = waitpid($targ, WNOHANG
);
260 return undef if $reaped < 0;
261 return $?
if $reaped == $targ;
264 $count = kill(9*$pg, $targ); # SIGKILL is 9
265 $reaped = waitpid($targ, WNOHANG
);
266 return undef if $reaped < 0;
267 return $?
if $reaped == $targ;
268 # We should not need to wait to reap a SIGKILL, however, just in case
269 # the system doesn't make a SIGKILL'd process immediately reapable
270 # (perhaps under extremely heavy load) we accommodate a brief delay
271 while ($count && time() < $harsh) {
272 select(undef, undef, undef, 0.2);
273 $reaped = waitpid($targ, WNOHANG
);
274 return undef if $reaped < 0;
275 return $?
if $reaped == $targ;
280 sub handle_softexit
{
281 error
("Waiting for outstanding jobs to finish... ".
282 "^C again to exit immediately");
285 $SIG{'INT'} = \
&handle_exit
;
289 error
("Killing outstanding jobs, please be patient...");
290 $SIG{'TERM'} = 'IGNORE';
292 kill_gently
($_->{'pid'}, 1);
294 unlink $lockfile if ($locked);
300 $opts{'queued_at'} = time;
301 $opts{'dont_run'} = 0;
302 $opts{'intensive'} = 0 unless exists $opts{'intensive'};
310 $job->{'command'}->($job);
311 if ($job->{'dont_run'}) {
320 "[".$job->{'type'}."::".$job->{'project'}."]";
323 # Only one of those per job!
324 sub exec_job_command
{
325 my ($job, $command, $err_only) = @_;
328 $job->{'finished'} = 0;
329 delete $job->{'pid'};
330 if (!defined($pid = fork)) {
331 error
(_job_name
($job) ." Can't fork job: $!");
332 $job->{'finished'} = 1;
337 select(undef, undef, undef, 0.1);
339 open STDIN
, '<', '/dev/null' or do { # low-precedence 'or' so the handler guards open() itself; '||' only tested the '/dev/null' string
340 error
(_job_name
($job) ." Can't read from /dev/null: $!");
344 open STDOUT
, '>', '/dev/null' or do { # low-precedence 'or' so the handler guards open() itself; '||' only tested the '/dev/null' string
345 error
(_job_name
($job) ." Can't write to /dev/null: $!");
349 # New process group so we can keep track of all of its children
350 if (!defined(POSIX
::setpgid
(0, 0))) {
351 error
(_job_name
($job) ." Can't create process group: $!");
356 # Stop perl from complaining
359 $job->{'pid'} = $pid;
360 $job->{'started_at'} = time;
364 my ($job, $msg) = @_;
365 $job->{'dont_run'} = 1;
366 error
(_job_name
($job) ." Skipping job: $msg") unless $quiet || !$msg;
369 sub reap_hanging_jobs
{
371 my $factor = $_->{'timeout_factor'} || 1;
372 if (defined($_->{'started_at'}) && (time - $_->{'started_at'}) > ($kill_after * $factor)) {
373 $_->{'finished'} = 1;
374 my $exitcode = kill_gently
($_->{'pid'}, 1);
377 error
(_job_name
($_) ." KILLED due to timeout" .
378 (($exitcode & 0x7f) == 9 ?
" with SIGKILL": ""));
379 push @jobs_killed, _job_name
($_);
386 if (!$job->{'finished'}) {
387 $job->{'on_success'}->($job) if defined($job->{'on_success'});
388 $job->{'finished'} = 1;
391 $job->{'on_error'}->($job) if defined($job->{'on_error'});
395 sub reap_finished_jobs
{
397 my $finished_any = 0;
398 foreach my $child (grep { !$_->{'pid'} && $_->{'killed'} } @running) {
399 delete $child->{'killed'};
400 reap_one_job
($child);
404 $pid = waitpid(-1, WNOHANG
);
408 my @child = grep { $_->{'pid'} && $_->{'pid'} == $pid } @running;
410 # any non-zero exit status should trigger on_error
411 $child[0]->{'finished'} = 1 if @child;
414 delete $child[0]->{'pid'};
415 reap_one_job
($child[0]);
418 @running = grep { $_->{'finished'} == 0 } @running;
422 sub have_intensive_jobs
{
423 grep { $_->{'intensive'} == 1 } @running;
427 "[". scalar(localtime) ."] ";
431 if ($^O
eq "linux") {
432 # Read /proc/loadavg on Linux
433 open(LOADAV
, '<', '/proc/loadavg') or return undef;
434 my $loadinfo = <LOADAV
>;
436 return (split(/\s/, $loadinfo, 4))[0..2];
438 # Read the output of uptime everywhere else (works on Linux too)
439 open(LOADAV
, '-|', 'uptime') or return undef;
440 my $loadinfo = <LOADAV
>;
442 $loadinfo =~ /load average[^0-9.]*([0-9.]+)[^0-9.]+([0-9.]+)[^0-9.]+([0-9.]+)/iso or return undef;
448 my $last_progress = time;
449 my $last_checkload = time - 5;
450 my $current_load = $load_trig;
457 my $s = @queue == 1 ?
'' : 's';
458 ferror
("--- Processing %d queued job$s", scalar(@queue));
460 $SIG{'INT'} = \
&handle_softexit
;
461 $SIG{'TERM'} = \
&handle_exit
;
462 while (@queue || @running) {
464 my $proceed_immediately = reap_finished_jobs
();
465 # Check current system load
466 if ($load_trig && (time - $last_checkload) >= 5 && defined((my @loadinfo = get_load_info
())[0])) {
467 my $current_load = $loadinfo[0];
468 if ($current_load > $load_trig && !$overloaded) {
470 error
("PAUSE: system load is at $current_load > $load_trig") if $progress;
471 } elsif ($current_load < $load_untrig && $overloaded) {
473 error
("RESUME: system load is at $current_load < $load_untrig") if $progress;
476 $load_info = ', paused (load '. $current_load .')';
478 $load_info = ', load '. $current_load;
480 $last_checkload = time;
483 if ($progress && (time - $last_progress) >= 60) {
484 ferror
("STATUS: %d queued, %d running, %d finished, %d skipped, %d killed$load_info", scalar(@queue), scalar(@running), $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
488 push @run_status, _job_name
($_)." ". (time - $_->{'started_at'}) ."s";
490 error
("STATUS: currently running: ". join(', ', @run_status));
492 $last_progress = time;
494 # Back off if we're too busy
495 if (@running >= $max_par || have_intensive_jobs
() >= $max_par_intensive || !@queue || $overloaded) {
496 sleep 1 unless $proceed_immediately;
500 run_job
(shift(@queue)) if @queue;
503 my $s = $jobs_executed == 1 ?
'' : 's';
504 ferror
("--- Queue processed. %d job$s executed, %d skipped, %d killed.", $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
508 sub run_perpetually
{
510 die "Lockfile '$lockfile' exists. Please make sure no other instance of jobd is running.\n";
512 open LOCK
, '>', $lockfile or die "Cannot create lockfile '$lockfile': $!\n"; # 'or' applies to open(); '||' only tested $lockfile and never died
519 # touch ctime of lockfile to prevent it from being removed by /tmp cleaning
520 chmod 0640, $lockfile;
521 chmod 0644, $lockfile;
522 # check for restart request
523 open LOCK
, '<', $lockfile or die "Lock file '$lockfile' has disappeared!\n"; # 'or' applies to open(); '||' only tested $lockfile and never died
524 my $request = <LOCK
>;
526 chomp $request if defined($request);
527 if (defined($request) && $request eq "restart") {
533 sleep($restart_delay) if $perpetual; # Let the system breathe for a moment
540 ######### Helpers {{{1
543 print STDERR ts
().shift()."\n";
546 error
(sprintf($_[0], @_[1..$#_]));
555 my $reexec = Girocco
::ExecUtil
->new;
556 my $realpath0 = realpath
($0);
558 close(DATA
) if fileno(DATA
);
560 Getopt
::Long
::Configure
('bundling');
562 my $parse_res = GetOptions
(
564 pod2usage
(-verbose
=> 2, -exitval
=> 0, -input
=> $realpath0)},
565 'quiet|q' => sub {++$quiet},
566 'progress|P' => sub {++$progress},
567 'kill-after|k=i' => \
$kill_after,
568 'max-parallel|p=i' => \
$max_par,
569 'max-intensive-parallel|i=i' => \
$max_par_intensive,
570 'load-triggers=s' => \
$load_triggers,
571 'restart-delay|d=i' => \
$restart_delay,
572 'lockfile|l=s' => \
$lockfile,
573 'same-pid' => \
$same_pid,
574 'all-once|a' => \
$all_once,
575 'one|o=s' => sub {$one_once{$_[1]} = 1, push(@one, $_[1])
576 unless exists $one_once{$_[1]}},
577 'update-only' => \
$update_only,
578 'gc-only' => \
$gc_only,
579 'needs-gc-only' => \
$needs_gc_only,
580 ) || pod2usage
(-exitval
=> 2, -input
=> $realpath0);
581 fatal
("Error: can only use one out of --all-once and --one")
582 if $all_once && @one;
583 my $onlycnt = ($update_only?
1:0) + ($gc_only?
1:0) + ($needs_gc_only?
1:0);
584 fatal
("Error: can only use one out of --update-only, --gc-only and --needs-gc-only")
586 fatal
("Error: --update-only, --gc-only or --needs-gc-only requires --all-once or --one")
587 if $onlycnt && !($all_once || @one);
589 delete $ENV{'show_progress'};
591 $ENV{'show_progress'} = 0 if $quiet > 1;
593 $progress = 1 unless $progress;
594 $ENV{'show_progress'} = $progress;
597 $load_triggers = '0,0' unless defined((get_load_info
())[0]);
598 ($load_trig, $load_untrig) = split(/,/, $load_triggers);
601 queue_one
($_) foreach @one;
613 if (run_perpetually
() eq "restart") {
614 error
("Restarting in response to restart request... ");
615 $reexec->reexec($same_pid);
616 error
("Continuing after failed restart: $!");
622 ########## Documentation {{{1
628 jobd.pl - Perform Girocco maintenance jobs
635 -h | --help detailed instructions
636 -q | --quiet run quietly
637 -P | --progress show occasional status updates
638 -k SECONDS | --kill-after SECONDS how long to wait before killing jobs
639 -p NUM | --max-parallel NUM how many jobs to run at the same time
640 -i NUM | --max-intensive-parallel NUM how many resource-hungry jobs to run
642 --load-triggers TRIG,UNTRIG stop queueing jobs at load above
643 TRIG and resume at load below UNTRIG
644 -d SECONDS | --restart-delay SECONDS wait for this many seconds between
646 -l FILE | --lockfile FILE create a lockfile in the given
648 --same-pid keep same pid during graceful restart
649 -a | --all-once process the list only once
650 -o PRJNAME | --one PRJNAME process only one project
651 --update-only process mirror updates only
652 --gc-only perform needed garbage collection only
653 --needs-gc-only perform needed mini gc only
661 Print the full description of jobd.pl's options.
665 Suppress non-error messages, e.g. for use when running this task as a cronjob.
666 When given two or more times suppress update ref change lines in logs as well.
670 Show information about the current status of the job queue occasionally. This
671 is automatically enabled if --quiet is not given. When specified two or more
672 times full ref change details will be shown for updates.
674 =item B<--kill-after SECONDS>
676 Kill supervised jobs after a certain time to avoid hanging the daemon.
678 =item B<--max-parallel NUM>
680 Run no more than that many jobs at the same time. The default is the number
681 of cpus * 2 for 1 or 2 cpus, 5 for 3 cpus and int(cpus * 1.5) for 4 cpus or
682 more with the default capped to 16 when more than 10 cpus are detected.
683 If the number of cpus cannot be determined, the default is 4.
685 =item B<--max-intensive-parallel NUM>
687 Run no more than that many resource-hungry jobs at the same time. Right now,
688 this refers to repacking jobs. The default is 1.
690 =item B<--load-triggers TRIG,UNTRIG>
692 If the first system load average (1 minute average) exceeds TRIG, don't queue
693 any more jobs until it goes below UNTRIG. This is currently only supported on
694 Linux and any other platforms that provide an uptime command with load average
697 If both values are zero, load checks are disabled. The default is the number
698 of cpus * 1.5 for TRIG and half that for UNTRIG. If the number of cpus cannot
699 be determined, the default is 6,3.
701 =item B<--restart-delay SECONDS>
703 After processing the queue, wait this many seconds until the queue is
704 restarted. The default is 300 seconds.
706 =item B<--lockfile FILE>
708 For perpetual operation, specify the full path to a lock file to create and
709 then remove after finishing/aborting. The default is /tmp/jobd-$suffix.lock
710 where $suffix is a 6-character string uniquely determined by the name and
711 nickname of this Girocco instance. The pid of the running jobd instance will
712 be written to the lock file.
716 When performing a graceful restart, keep the same pid rather than switching to
721 Instead of perpetually processing all projects over and over again, process
722 them just once and then exit.
723 Conflicts with B<--one PRJNAME> option.
725 =item B<--one PRJNAME>
727 Process only the given project (given as just the project name without C<.git>
728 suffix) and then exit. May be repeated to process more than one project.
729 Conflicts with B<--all-once> option.
731 =item B<--update-only>
733 Limit processing to only those projects that need a mirror update.
734 Behaves as though every project has a C<.nogc> file present in it.
735 Requires use of B<--all-once> or B<--one PRJNAME> option.
736 Conflicts with B<--gc-only> and B<--needs-gc-only> options.
740 Limit processing to only those projects that need to have garbage collection
741 run on them. Behaves as though every project has a C<.bypass_fetch> file
742 present in it. Requires use of B<--all-once> or B<--one PRJNAME> option.
743 Conflicts with B<--update-only> and B<--needs-gc-only> options.
745 =item B<--needs-gc-only>
747 Limit processing to only those projects that need to have mini garbage
748 collection run on them. Behaves as though every project with a C<.needsgc>
749 file present in it also has a C<.bypass_fetch> file present in it and as though
750 every project without a C<.needsgc> file present in it has a C<.bypass> file
751 present in it. Requires use of B<--all-once> or B<--one PRJNAME> option.
752 Conflicts with B<--update-only> and B<--gc-only> options.
758 jobd.pl is Girocco's repository maintenance servant; it periodically checks
759 all the repositories and updates mirrored repositories and repacks push-mode
760 repositories when needed.