3 # jobd - perform Girocco maintenance jobs
5 # Run with --help for details
12 use POSIX
":sys_wait_h";
15 use lib
"__BASEDIR__";
20 BEGIN {noFatalsToBrowser
}
21 use Girocco
::ExecUtil
;
# NOTE(review): this file is a line-mangled extract — the original source line
# numbers are fused into the text and many lines are missing. Comments below
# document only what the visible fragments establish.
# Tunable daemon defaults, derived from the detected CPU count when available.
26 my $cpus = online_cpus
;
# Parallel job cap: twice the CPU count, or 8 when the count is unknown.
28 my $max_par = $cpus ?
$cpus * 2 : 8;
# At most one resource-hungry (repack/gc) job at a time by default.
29 my $max_par_intensive = 1;
# Load trigger pair "TRIG,UNTRIG": pause queueing above 1.5*cpus,
# resume below 0.75*cpus; fallback "6,3" when cpus is unknown.
30 my $load_triggers = $cpus ?
sprintf("%g,%g", $cpus * 1.5, $cpus * 0.75) : "6,3";
# Lock file preventing concurrent jobd instances (see --lockfile).
31 my $lockfile = "/tmp/jobd-$Girocco::Config::tmpsuffix.lock";
# Seconds to sleep between perpetual queue runs (see --restart-delay).
32 my $restart_delay = 300;
# Numeric halves of $load_triggers; populated after option parsing.
37 my ($load_trig, $load_untrig);
# update_project fragment: refresh (fetch) one mirror project, then arrange gc.
# NOTE(review): the enclosing "sub update_project" header and several interior
# lines are missing from this extract; $job is presumably the job hashref
# argument — confirm against the upstream file.
43 my $p = $job->{'project'};
44 check_project_exists
($job) || return;
# Skip fetching entirely for projects flagged nofetch/bypass/bypass_fetch,
# but still hand the project over to gc scheduling.
45 if (-e get_project_path
($p).".nofetch" || -e get_project_path
($p).".bypass" ||
46 -e get_project_path
($p).".bypass_fetch") {
48 return setup_gc
($job);
# An in-progress (and not failed) initial clone means there is nothing
# to refresh yet.
50 if (-e get_project_path
($p).".clone_in_progress" && ! -e get_project_path
($p).".clone_failed") {
51 job_skip
($job, "initial mirroring not complete yet");
54 if (-e get_project_path
($p).".clone_failed") {
55 job_skip
($job, "initial mirroring failed");
56 # Still need to gc non top-level clones even if they've failed
57 # otherwise the objects copied into them from the parent will
58 # just accumulate without bound
# (a "/" in the project name marks a fork, i.e. a non top-level clone)
59 setup_gc
($job) if $p =~ m
,/,;
# Rate-limit refreshes: skip when the last one is newer than the
# (randomly jittered) minimum mirror interval.
62 if (my $ts = is_operation_uptodate
($p, 'lastrefresh', rand_adjust
($Girocco::Config
::min_mirror_interval
))) {
63 job_skip
($job, "not needed right now, last run at $ts");
67 if (is_svn_clone
($p)) {
68 # git svn can be very, very slow at times
69 $job->{'timeout_factor'} = 3;
# Hand the actual fetch off to the external update.sh helper.
71 exec_job_command
($job, ["$Girocco::Config::basedir/jobd/update.sh", $p], $quiet);
# gc_project fragment: garbage-collect one project's repository.
# NOTE(review): the enclosing "sub gc_project" header and some interior lines
# are missing from this extract.
76 my $p = $job->{'project'};
77 check_project_exists
($job) || return;
78 my $projpath = get_project_path
($p);
# Skip gc for projects flagged nogc/bypass, and for delaygc projects unless
# explicitly re-allowed (allowgc) or forced (needsgc).
79 if (-e
"$projpath.nogc" || -e
"$projpath.bypass" ||
80 (-e
"$projpath.delaygc" && ! -e
"$projpath.allowgc" && ! -e
"$projpath.needsgc")) {
# Rate-limit gc (unless a needsgc marker forces it) using the jittered
# minimum gc interval.
85 if (! -e
"$projpath.needsgc" &&
86 ($ts = is_operation_uptodate
($p, 'lastgc', rand_adjust
($Girocco::Config
::min_gc_interval
)))) {
87 job_skip
($job, "not needed right now, last run at $ts");
90 # allow garbage collection to run for longer than an update
# Remember the pre-run lastgc value so maybe_setup_gc_again can tell
# whether gc.sh completed and cleared it.
91 $job->{'lastgc'} = get_git_config
($projpath, "gitweb.lastgc");
92 $job->{'timeout_factor'} = 2;
# Hand the actual collection off to the external gc.sh helper.
93 exec_job_command
($job, ["$Girocco::Config::basedir/jobd/gc.sh", $p], $quiet);
# setup_gc fragment: queue a gc job for this project.
# NOTE(review): the surrounding "sub setup_gc { queue_job(" scaffolding is
# missing from this extract; these are the queue_job named arguments.
99 project
=> $job->{'project'},
101 command
=> \
&gc_project
,
# On success, possibly queue one bonus gc run (see maybe_setup_gc_again).
103 on_success
=> \
&maybe_setup_gc_again
,
# Queue at most one follow-up gc run for a project whose "mini" gc
# triggered a full gc (detected via the gitweb.lastgc config value).
# NOTE(review): interior lines (including the $job unpacking and the
# queue_job call scaffolding) are missing from this extract.
107 sub maybe_setup_gc_again
{
109 # If lastgc was set then gc.sh ran successfully and now it's not set
110 # then queue up another run of gc.sh for the project.
111 # However, just in case, no matter what happens with the extra
112 # gc.sh run no more "bonus" runs are possible to avoid any loops.
113 # This allows a "mini" gc that triggers a full gc to have the
114 # full gc run as part of the same --all-once run through instead
115 # of waiting. A very good thing for users of the --all-once option.
116 if ($job->{'lastgc'}) {
117 my $projpath = get_project_path
($job->{'project'});
# lastgc cleared after a successful run => a full gc is wanted now.
118 get_git_config
($projpath, "gitweb.lastgc") or
120 project
=> $job->{'project'},
122 command
=> \
&gc_project
,
# Guard helper: skip the job when the project's repository directory is gone.
# NOTE(review): the "$job" unpacking line and the return statements are
# missing from this extract; presumably returns false on skip, true otherwise.
128 sub check_project_exists
{
130 my $p = $job->{'project'};
131 if (!-d get_project_path
($p)) {
132 job_skip
($job, "non-existent project");
sub get_project_path
{
	# Map a project name to its on-disk bare repository path.
	# Projects live under the configured repository root as
	# "<name>.git"; the returned path keeps a trailing slash.
	my $name = shift;
	return "$Girocco::Config::reporoot/$name.git/";
}
# Single-entry cache of the most recently read git config file, keyed by
# path plus a dev:ino:size:mtime identity string.
142 my $_last_config_path;
# Cache-reset fragment (enclosing sub/scope lines missing from this extract).
146 $_last_config_path = "";
147 $_last_config_id = "";
# get_git_config fragment: return $name's value from $projdir/config,
# re-reading the file only when its identity has changed.
# NOTE(review): the "sub get_git_config" header line is missing here.
152 my ($projdir, $name) = @_;
153 defined($projdir) && -d
$projdir && -f
"$projdir/config" or return undef;
154 my $cf = "$projdir/config";
155 my @stat = stat($cf);
# Require a non-empty file with a usable mtime before trusting the cache key.
156 @stat && $stat[7] && $stat[9] or return undef;
157 my $id = join(":", $stat[0], $stat[1], $stat[7], $stat[9]); # dev,ino,size,mtime
# Cache miss: path changed, identity changed, or cached value is not a hash.
158 if ($_last_config_path ne $cf || $_last_config_id ne $id || ref($_last_config) ne 'HASH') {
159 my $data = read_config_file_hash
($cf);
160 defined($data) or $data = {};
# Invalidate first so a partially updated cache is never considered valid.
161 $_last_config_path = $_last_config_id = "";
162 $_last_config = $data;
163 $_last_config_id = $id;
164 $_last_config_path = $cf;
166 return $_last_config->{$name};
sub is_operation_uptodate
{
	# Return the stored RFC 2822 timestamp of the project's last
	# successful $which operation (read from gitweb.$which in the
	# repository config) when it occurred within the last $threshold
	# seconds; otherwise return undef.
	my ($project, $which, $threshold) = @_;
	my $path = get_project_path($project);
	my $stamp = get_git_config($path, "gitweb.$which");
	defined($stamp) or $stamp = '';
	# An unparseable/empty timestamp counts as epoch, i.e. "stale".
	my $age = time - (parse_rfc2822_date($stamp) || 0);
	return $age <= $threshold ? $stamp : undef;
}
# is_svn_clone fragment: true when the project mirrors an SVN source,
# i.e. its gitweb.baseurl looks like svn:/svn+ AND svn-remote.svn.url is set.
# NOTE(review): the "sub is_svn_clone" header and the $project unpacking
# line are missing from this extract.
180 my $path = get_project_path
($project);
181 my $baseurl = get_git_config
($path, 'gitweb.baseurl');
182 defined($baseurl) or $baseurl = '';
183 my $svnurl = get_git_config
($path, 'svn-remote.svn.url');
184 defined($svnurl) or $svnurl = '';
# Returns the (truthy) svn URL itself when both conditions hold.
185 return $baseurl =~ /^svn[:+]/i && $svnurl;
# queue_one fragment: enqueue an update job for one project; gc is arranged
# afterwards whether the update succeeds or fails.
# NOTE(review): the enclosing "sub queue_one { queue_job(" scaffolding is
# missing from this extract; these are the queue_job named arguments.
193 command
=> \
&update_project
,
194 on_success
=> \
&setup_gc
,
195 on_error
=> \
&setup_gc
,
# queue_all fragment: enqueue every known project.
200 queue_one
($_) for (Girocco
::Project
->get_full_list());
203 ######### Daemon operation {{{1
213 # Kills and reaps the specified pid. Returns exit status ($?) on success
214 # otherwise undef if process could not be killed or reaped
215 # First sends SIGINT and if process does not exit within 15 seconds then SIGKILL
216 # We used to send SIGTERM instead of SIGINT, but by using SIGINT we can take
217 # advantage of "tee -i" in our update scripts and really anything we're killing
218 # should respond the same to either SIGINT or SIGTERM and exit gracefully.
219 # Usage: my $exitcode = kill_gently($pid, $kill_process_group = 0);
# NOTE(review): the "sub kill_gently" header and the $targ unpacking line
# are missing from this extract.
222 my $use_pg = shift || 0;
223 # Note that the docs for Perl's kill state that a negative signal
224 # number should be used to kill process groups and that while
225 # a negative process id (and positive signal number) may also do that
226 # on some platforms, that's not portable.
# Signal-number multiplier: -1 targets the process group, 1 a single pid.
227 my $pg = $use_pg ?
-1 : 1;
228 my $harsh = time() + 15; # SIGKILL after this delay
229 my $count = kill(2*$pg, $targ); # SIGINT is 2
# Non-blocking reap attempts; $reaped < 0 means nothing left to wait for.
230 my $reaped = waitpid($targ, WNOHANG
);
231 return undef if $reaped < 0;
232 return $?
if $reaped == $targ;
# Poll every 0.2s for up to ~15s for a graceful exit after SIGINT.
233 while ($count && time() < $harsh) {
234 select(undef, undef, undef, 0.2);
235 $reaped = waitpid($targ, WNOHANG
);
236 return undef if $reaped < 0;
237 return $?
if $reaped == $targ;
# Grace period expired: escalate to SIGKILL.
240 $count = kill(9*$pg, $targ); # SIGKILL is 9
241 $reaped = waitpid($targ, WNOHANG
);
242 return undef if $reaped < 0;
243 return $?
if $reaped == $targ;
244 # We should not need to wait to reap a SIGKILL, however, just in case
245 # the system doesn't make a SIGKILL'd process immediately reapable
246 # (perhaps under extremely heavy load) we accommodate a brief delay
247 while ($count && time() < $harsh) {
248 select(undef, undef, undef, 0.2);
249 $reaped = waitpid($targ, WNOHANG
);
250 return undef if $reaped < 0;
251 return $?
if $reaped == $targ;
# First ^C: stop queueing, let running jobs finish; second ^C exits hard.
256 sub handle_softexit
{
257 error
("Waiting for outstanding jobs to finish... ".
258 "^C again to exit immediately");
# Re-arm INT so a second ^C invokes the hard-exit handler.
261 $SIG{'INT'} = \
&handle_exit
;
# handle_exit fragment: kill all running jobs (and their process groups),
# then clean up the lock file.
# NOTE(review): the "sub handle_exit" header and loop scaffolding are
# missing from this extract.
265 error
("Killing outstanding jobs, please be patient...");
266 $SIG{'TERM'} = 'IGNORE';
268 kill_gently
($_->{'pid'}, 1);
270 unlink $lockfile if ($locked);
# queue_job fragment: stamp bookkeeping defaults onto the job options.
276 $opts{'queued_at'} = time;
277 $opts{'dont_run'} = 0;
278 $opts{'intensive'} = 0 unless exists $opts{'intensive'};
# run_job fragment: invoke the job's command; it may set dont_run to skip.
286 $job->{'command'}->($job);
287 if ($job->{'dont_run'}) {
# _job_name fragment: human-readable "[type::project]" label for messages.
296 "[".$job->{'type'}."::".$job->{'project'}."]";
299 # Only one of those per job!
# Fork and run the job's external command in its own process group,
# recording pid and start time on the job hashref for the reapers.
# NOTE(review): several interior lines (child exec, error exits) are
# missing from this extract.
300 sub exec_job_command
{
301 my ($job, $command, $err_only) = @_;
304 $job->{'finished'} = 0;
305 delete $job->{'pid'};
306 if (!defined($pid = fork)) {
307 error
(_job_name
($job) ." Can't fork job: $!");
308 $job->{'finished'} = 1;
313 select(undef, undef, undef, 0.1);
# NOTE(review): '/dev/null' || do{...} — high-precedence || binds to the
# always-true filename string, so this error branch can never run; the
# low-precedence "or" was almost certainly intended. Same on STDOUT below.
# Also note the missing space in the "Can't read" message vs its siblings.
315 open STDIN
, '<', '/dev/null' || do {
316 error
(_job_name
($job) ."Can't read from /dev/null: $!");
320 open STDOUT
, '>', '/dev/null' || do {
321 error
(_job_name
($job) ." Can't write to /dev/null: $!");
325 # New process group so we can keep track of all of its children
326 if (!defined(POSIX
::setpgid
(0, 0))) {
327 error
(_job_name
($job) ." Can't create process group: $!");
332 # Stop perl from complaining
# Parent side: remember the child's pid and when it started (used by
# reap_hanging_jobs for timeout enforcement).
335 $job->{'pid'} = $pid;
336 $job->{'started_at'} = time;
# job_skip: mark the job as not-to-run and optionally log why.
340 my ($job, $msg) = @_;
341 $job->{'dont_run'} = 1;
342 error
(_job_name
($job) ." Skipping job: $msg") unless $quiet || !$msg;
# Kill any running job that exceeded its (factor-scaled) timeout and
# record it in @jobs_killed.
# NOTE(review): the loop scaffolding around these lines is missing from
# this extract.
345 sub reap_hanging_jobs
{
347 my $factor = $_->{'timeout_factor'} || 1;
348 if (defined($_->{'started_at'}) && (time - $_->{'started_at'}) > ($kill_after * $factor)) {
349 $_->{'finished'} = 1;
350 my $exitcode = kill_gently
($_->{'pid'}, 1);
# Low 7 bits of the status are the terminating signal; 9 means the
# process only died to SIGKILL.
353 error
(_job_name
($_) ." KILLED due to timeout" .
354 (($exitcode & 0x7f) == 9 ?
" with SIGKILL": ""));
355 push @jobs_killed, _job_name
($_);
# reap_one_job fragment: fire on_success/on_error callbacks exactly once.
362 if (!$job->{'finished'}) {
363 $job->{'on_success'}->($job) if defined($job->{'on_success'});
364 $job->{'finished'} = 1;
367 $job->{'on_error'}->($job) if defined($job->{'on_error'});
# Collect exited children: first jobs already killed by the timeout
# reaper, then anything waitpid reports, then prune @running.
371 sub reap_finished_jobs
{
373 my $finished_any = 0;
374 foreach my $child (grep { !$_->{'pid'} && $_->{'killed'} } @running) {
375 delete $child->{'killed'};
376 reap_one_job
($child);
380 $pid = waitpid(-1, WNOHANG
);
384 my @child = grep { $_->{'pid'} && $_->{'pid'} == $pid } @running;
386 # any non-zero exit status should trigger on_error
387 $child[0]->{'finished'} = 1 if @child;
390 delete $child[0]->{'pid'};
391 reap_one_job
($child[0]);
394 @running = grep { $_->{'finished'} == 0 } @running;
sub have_intensive_jobs
{
	# In scalar context this yields how many currently running jobs
	# are flagged resource-intensive; in list context, the jobs.
	my @intensive = grep { $_->{'intensive'} == 1 } @running;
	return @intensive;
}
# ts fragment: "[<local time>] " prefix for log messages.
403 "[". scalar(localtime) ."] ";
# get_load_info fragment: return the 1/5/15-minute load averages, or undef
# when they cannot be determined.
# NOTE(review): the "sub get_load_info" header and some closing lines are
# missing from this extract; LOADAV is a bareword filehandle (legacy style).
407 if ($^O
eq "linux") {
408 # Read /proc/loadavg on Linux
409 open(LOADAV
, '<', '/proc/loadavg') or return undef;
410 my $loadinfo = <LOADAV
>;
412 return (split(/\s/, $loadinfo, 4))[0..2];
414 # Read the output of uptime everywhere else (works on Linux too)
415 open(LOADAV
, '-|', 'uptime') or return undef;
416 my $loadinfo = <LOADAV
>;
# Captures the three load figures from uptime's "load average: x, y, z".
418 $loadinfo =~ /load average[^0-9.]*([0-9.]+)[^0-9.]+([0-9.]+)[^0-9.]+([0-9.]+)/iso or return undef;
# run_queue fragment: main scheduling loop — reap finished jobs, throttle
# on system load, report status, and start queued jobs up to the caps.
# NOTE(review): the "sub run_queue" header and several interior lines are
# missing from this extract.
424 my $last_progress = time;
425 my $last_checkload = time - 5;
426 my $current_load = $load_trig;
433 my $s = @queue == 1 ?
'' : 's';
434 ferror
("--- Processing %d queued job$s", scalar(@queue));
436 $SIG{'INT'} = \
&handle_softexit
;
437 $SIG{'TERM'} = \
&handle_exit
;
438 while (@queue || @running) {
440 my $proceed_immediately = reap_finished_jobs
();
441 # Check current system load
# Re-check load at most every 5 seconds, only when triggers are enabled.
442 if ($load_trig && (time - $last_checkload) >= 5 && defined((my @loadinfo = get_load_info
())[0])) {
# NOTE(review): this "my" redeclares and shadows the outer $current_load
# initialized above — the outer copy is never updated; verify intended.
443 my $current_load = $loadinfo[0];
444 if ($current_load > $load_trig && !$overloaded) {
446 error
("PAUSE: system load is at $current_load > $load_trig") if $progress;
447 } elsif ($current_load < $load_untrig && $overloaded) {
449 error
("RESUME: system load is at $current_load < $load_untrig") if $progress;
452 $load_info = ', paused (load '. $current_load .')';
454 $load_info = ', load '. $current_load;
456 $last_checkload = time;
# Periodic status line at most once a minute when --progress is on.
459 if ($progress && (time - $last_progress) >= 60) {
460 ferror
("STATUS: %d queued, %d running, %d finished, %d skipped, %d killed$load_info", scalar(@queue), scalar(@running), $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
464 push @run_status, _job_name
($_)." ". (time - $_->{'started_at'}) ."s";
466 error
("STATUS: currently running: ". join(', ', @run_status));
468 $last_progress = time;
470 # Back off if we're too busy
471 if (@running >= $max_par || have_intensive_jobs
() >= $max_par_intensive || !@queue || $overloaded) {
472 sleep 1 unless $proceed_immediately;
476 run_job
(shift(@queue)) if @queue;
# Final summary after the queue drains.
479 my $s = $jobs_executed == 1 ?
'' : 's';
480 ferror
("--- Queue processed. %d job$s executed, %d skipped, %d killed.", $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
# Run the queue forever under a lock file; the lock file doubles as a
# restart-request channel (writing "restart" into it asks for a re-exec).
# NOTE(review): many interior lines (lock write, queue_all/run_queue calls,
# close statements) are missing from this extract.
484 sub run_perpetually
{
486 die "Lockfile '$lockfile' exists. Please make sure no other instance of jobd is running.\n";
# NOTE(review): "$lockfile || die" — high-precedence || binds to the
# always-true $lockfile string, so this die can never fire; low-precedence
# "or" was almost certainly intended. Same on the re-open below.
488 open LOCK
, '>', $lockfile || die "Cannot create lockfile '$lockfile': $!\n";
495 # touch ctime of lockfile to prevent it from being removed by /tmp cleaning
# (the chmod pair changes the mode and back, updating ctime as a side effect)
496 chmod 0640, $lockfile;
497 chmod 0644, $lockfile;
498 # check for restart request
499 open LOCK
, '<', $lockfile || die "Lock file '$lockfile' has disappeared!\n";
500 my $request = <LOCK
>;
502 chomp $request if defined($request);
503 if (defined($request) && $request eq "restart") {
509 sleep($restart_delay) if $perpetual; # Let the system breathe for a moment
516 ######### Helpers {{{1
# error fragment: timestamped message to STDERR.
519 print STDERR ts
().shift()."\n";
# ferror fragment: printf-style wrapper around error().
522 error
(sprintf($_[0], @_[1..$#_]));
# Main program fragment: set up re-exec support, parse options, then run
# either one project, all projects once, or perpetually.
# NOTE(review): several interior lines are missing from this extract.
531 my $reexec = Girocco
::ExecUtil
->new;
# Resolve $0 so pod2usage can find this script's POD after a chdir.
532 my $realpath0 = realpath
($0);
534 close(DATA
) if fileno(DATA
);
536 Getopt
::Long
::Configure
('bundling');
537 my $parse_res = GetOptions
(
539 pod2usage
(-verbose
=> 2, -exitval
=> 0, -input
=> $realpath0)},
540 'quiet|q' => \
$quiet,
541 'progress|P' => \
$progress,
542 'kill-after|k=i' => \
$kill_after,
543 'max-parallel|p=i' => \
$max_par,
544 'max-intensive-parallel|i=i' => \
$max_par_intensive,
545 'load-triggers=s' => \
$load_triggers,
546 'restart-delay|d=i' => \
$restart_delay,
547 'lockfile|l=s' => \
$lockfile,
548 'same-pid' => \
$same_pid,
549 'all-once|a' => \
$all_once,
551 ) || pod2usage
(-exitval
=> 2, -input
=> $realpath0);
552 fatal
("Error: can only use one out of --all-once and --one")
553 if ($all_once && $one);
556 $ENV{'show_progress'} = '1';
# Disable load triggers entirely when load info is unavailable.
560 $load_triggers = '0,0' unless defined((get_load_info
())[0]);
561 ($load_trig, $load_untrig) = split(/,/, $load_triggers);
# Perpetual mode: a "restart" request re-execs the daemon in place.
576 if (run_perpetually
() eq "restart") {
577 error
("Restarting in response to restart request... ");
578 $reexec->reexec($same_pid);
579 error
("Continuing after failed restart: $!");
585 ########## Documentation {{{1
591 jobd.pl - Perform Girocco maintenance jobs
598 -h | --help detailed instructions
599 -q | --quiet run quietly
600 -P | --progress show occasional status updates
601 -k SECONDS | --kill-after SECONDS how long to wait before killing jobs
602 -p NUM | --max-parallel NUM how many jobs to run at the same time
603 -i NUM | --max-intensive-parallel NUM how many resource-hungry jobs to run
605 --load-triggers TRIG,UNTRIG stop queueing jobs at load above
606 TRIG and resume at load below UNTRIG
607 -d SECONDS | --restart-delay SECONDS wait for this many seconds between
609 -l FILE | --lockfile FILE create a lockfile in the given
611 --same-pid keep same pid during graceful restart
612 -a | --all-once process the list only once
613 -o PRJNAME | --one PRJNAME process only one project
621 Print the full description of jobd.pl's options.
625 Suppress non-error messages, e.g. for use when running this task as a cronjob.
629 Show information about the current status of the job queue occasionally. This
630 is automatically enabled if --quiet is not given.
632 =item B<--kill-after SECONDS>
634 Kill supervised jobs after a certain time to avoid hanging the daemon.
636 =item B<--max-parallel NUM>
638 Run no more than that many jobs at the same time. The default is the number
639 of cpus * 2. If the number of cpus cannot be determined, the default is 8.
641 =item B<--max-intensive-parallel NUM>
643 Run no more than that many resource-hungry jobs at the same time. Right now,
644 this refers to repacking jobs. The default is 1.
646 =item B<--load-triggers TRIG,UNTRIG>
648 If the first system load average (1 minute average) exceeds TRIG, don't queue
649 any more jobs until it goes below UNTRIG. This is currently only supported on
650 Linux and any other platforms that provide an uptime command with load average
653 If both values are zero, load checks are disabled. The default is the number
654 of cpus * 1.5 for TRIG and half that for UNTRIG. If the number of cpus cannot
655 be determined, the default is 6,3.
657 =item B<--restart-delay SECONDS>
659 After processing the queue, wait this many seconds until the queue is
660 restarted. The default is 300 seconds.
662 =item B<--lockfile FILE>
664 For perpetual operation, specify the full path to a lock file to create and
665 then remove after finishing/aborting. The default is /tmp/jobd-$suffix.lock
666 where $suffix is a 6-character string uniquely determined by the name and
667 nickname of this Girocco instance. The pid of the running jobd instance will
668 be written to the lock file.
672 When performing a graceful restart, keep the same pid rather than switching to
677 Instead of perpetually processing all projects over and over again, process
678 them just once and then exit.
680 =item B<--one PRJNAME>
682 Process only the given project (given as just the project name without C<.git>
683 suffix) and then exit.
689 jobd.pl is Girocco's repositories maintenance servant; it periodically checks
690 all the repositories and updates mirrored repositories and repacks push-mode
691 repositories when needed.