3 # jobd - perform Girocco maintenance jobs
5 # Run with --help for details
12 use POSIX
":sys_wait_h";
# Suppress HTML-formatted fatal output: jobd runs from the command line /
# as a daemon, not as a CGI.  (noFatalsToBrowser is imported by a use
# statement outside this extract -- TODO confirm its source module.)
BEGIN {noFatalsToBrowser}

# Number of online CPUs; may be 0/undef when it cannot be determined.
my $cpus = online_cpus;
# Default job parallelism: twice the CPU count, or 8 when unknown.
my $max_par = $cpus ? $cpus * 2 : 8;
# Resource-hungry jobs (repacks) are serialized by default.
my $max_par_intensive = 1;
# "TRIG,UNTRIG" load-average pair: stop queueing above TRIG, resume
# below UNTRIG.  Defaults to cpus*1.5 / cpus*0.75, or "6,3" when the
# CPU count is unknown.
my $load_triggers = $cpus ? sprintf("%g,%g", $cpus * 1.5, $cpus * 0.75) : "6,3";
# Lock file guarding perpetual operation (one jobd per Girocco instance).
my $lockfile = "/tmp/jobd-$Girocco::Config::tmpsuffix.lock";
# Seconds to pause between perpetual queue runs.
my $restart_delay = 300;
# Parsed numeric halves of $load_triggers (filled in during startup).
my ($load_trig, $load_untrig);
# --- body of update_project($job); the sub header is not visible in this
# extract.  Refresh one mirrored project unless a marker file or the
# freshness check says it should be skipped.
# NOTE(review): several original lines (closing braces, early returns,
# a job_skip call in the first branch) are missing from this extract;
# comments below describe only what is visible.
my $p = $job->{'project'};
check_project_exists($job) || return;
# .nofetch/.bypass/.bypass_fetch: fetching is disabled for this project;
# still schedule garbage collection for it.
if (-e get_project_path($p).".nofetch" || -e get_project_path($p).".bypass" ||
    -e get_project_path($p).".bypass_fetch") {
	return setup_gc($job);
# An initial clone that is still in progress (and has not failed) must
# not be disturbed by an update.
if (-e get_project_path($p).".clone_in_progress" && ! -e get_project_path($p).".clone_failed") {
	job_skip($job, "initial mirroring not complete yet");
if (-e get_project_path($p).".clone_failed") {
	job_skip($job, "initial mirroring failed");
	# Still need to gc non top-level clones even if they've failed
	# otherwise the objects copied into them from the parent will
	# just accumulate without bound
	setup_gc($job) if $p =~ m,/,;
# Skip when the last refresh is recent enough; min_mirror_interval is
# randomly adjusted to spread refreshes over time.
if (my $ts = is_operation_uptodate($p, 'lastrefresh', rand_adjust($Girocco::Config::min_mirror_interval))) {
	job_skip($job, "not needed right now, last run at $ts");
if (is_svn_clone($p)) {
	# git svn can be very, very slow at times
	$job->{'timeout_factor'} = 3;
exec_job_command($job, ["$Girocco::Config::basedir/jobd/update.sh", $p], $quiet);
# --- body of gc_project($job); the sub header is not visible in this
# extract.  Run garbage collection for one project unless marker files
# or the lastgc freshness check say otherwise.
# NOTE(review): closing braces / early returns and the declaration of
# $ts appear to be missing from this extract.
my $p = $job->{'project'};
check_project_exists($job) || return;
my $projpath = get_project_path($p);
# .nogc/.bypass disable gc outright; .delaygc defers it unless .allowgc
# or .needsgc overrides the delay.
if (-e "$projpath.nogc" || -e "$projpath.bypass" ||
    (-e "$projpath.delaygc" && ! -e "$projpath.allowgc" && ! -e "$projpath.needsgc")) {
# Unless gc was explicitly requested (.needsgc), skip when the last gc
# is recent enough; min_gc_interval is randomly adjusted to spread load.
if (! -e "$projpath.needsgc" &&
    ($ts = is_operation_uptodate($p, 'lastgc', rand_adjust($Girocco::Config::min_gc_interval)))) {
	job_skip($job, "not needed right now, last run at $ts");
# allow garbage collection to run for longer than an update
$job->{'timeout_factor'} = 2;
exec_job_command($job, ["$Girocco::Config::basedir/jobd/gc.sh", $p], $quiet);
# --- fragment of setup_gc($job): arguments of a queue_job(...) call that
# schedules a gc job for the same project (enclosing call and sub header
# are not visible in this extract) ---
project => $job->{'project'},
command => \&gc_project,
# Verify that the job's project still has a repository directory on disk;
# when it does not, record the job as skipped.
# NOTE(review): the line initializing $job (presumably "my $job = shift;"),
# the early return and the closing braces are not visible in this extract.
sub check_project_exists {
	my $p = $job->{'project'};
	if (!-d get_project_path($p)) {
		job_skip($job, "non-existent project");
# Return the absolute on-disk path of a project's repository, including
# the trailing slash.  Callers append suffixes directly to the result
# (e.g. get_project_path($p).".nofetch" -> ".../proj.git/.nofetch").
#
# Arguments: the project name, without a ".git" suffix.
sub get_project_path {
	my $name = shift;
	return "$Girocco::Config::reporoot/$name.git/";
}
# Return the stored timestamp for the given operation ("lastrefresh",
# "lastgc", ...) when it is no older than $threshold seconds, otherwise
# undef (meaning the operation should run again).  The timestamp is read
# from the repository's "gitweb.<which>" git-config key via get_git.
# NOTE(review): a line normalizing $timestamp (likely "chomp") and the
# closing brace appear to be missing from this extract.
sub is_operation_uptodate {
	my ($project, $which, $threshold) = @_;
	my $path = get_project_path($project);
	my $timestamp = get_git("--git-dir=$path", 'config', "gitweb.$which");
	defined($timestamp) or $timestamp = '';
	# Unparseable/empty timestamps become epoch 0 and thus read as stale.
	my $unix_ts = parse_rfc2822_date($timestamp) || 0;
	# Fresh enough: return the (truthy) human-readable timestamp;
	# otherwise undef so the caller proceeds with the operation.
	(time - $unix_ts) <= $threshold ? $timestamp : undef;
# --- body of is_svn_clone($project); the sub header is not visible in
# this extract.  A project counts as an svn clone when its
# gitweb.baseurl starts with "svn:" or "svn+<scheme>" AND a
# svn-remote.svn.url is configured in the repository.
my $path = get_project_path($project);
my $baseurl = get_git("--git-dir=$path", 'config', 'gitweb.baseurl');
defined($baseurl) or $baseurl = '';
my $svnurl = get_git("--git-dir=$path", 'config', 'svn-remote.svn.url');
defined($svnurl) or $svnurl = '';
return $baseurl =~ /^svn[:+]/i && $svnurl;
# --- fragment of a queue_job(...) argument list (enclosing call and sub
# header not visible): run update_project for the project, and schedule
# gc afterwards whether the update succeeded or failed ---
command => \&update_project,
on_success => \&setup_gc,
on_error => \&setup_gc,
# --- fragment of queue_all (sub header not visible): queue a job for
# every project known to this Girocco instance ---
queue_one($_) for (Girocco::Project->get_full_list());
######### Daemon operation {{{1

# Kills and reaps the specified pid. Returns exit status ($?) on success
# otherwise undef if process could not be killed or reaped
# First sends SIGINT and if process does not exit within 15 seconds then SIGKILL
# We used to send SIGTERM instead of SIGINT, but by using SIGINT we can take
# advantage of "tee -i" in our update scripts and really anything we're killing
# should respond the same to either SIGINT or SIGTERM and exit gracefully.
# Usage: my $exitcode = kill_gently($pid, $kill_process_group = 0);
# NOTE(review): the "sub kill_gently {" line, the line reading the target
# pid into $targ, and the closing braces are not visible in this extract.
my $use_pg = shift || 0;
# Note that the docs for Perl's kill state that a negative signal
# number should be used to kill process groups and that while a
# a negative process id (and positive signal number) may also do that
# on some platforms, that's not portable.
my $pg = $use_pg ? -1 : 1;
my $harsh = time() + 15; # SIGKILL after this delay
# Multiplying the signal number by $pg (+1 or -1) selects single-process
# vs whole-process-group delivery without duplicating the kill calls.
my $count = kill(2*$pg, $targ); # SIGINT is 2
my $reaped = waitpid($targ, WNOHANG);
return undef if $reaped < 0;    # nothing to reap
return $? if $reaped == $targ;  # already exited: report its status
# Poll (200ms naps) for a graceful exit until the deadline passes.
while ($count && time() < $harsh) {
	select(undef, undef, undef, 0.2);
	$reaped = waitpid($targ, WNOHANG);
	return undef if $reaped < 0;
	return $? if $reaped == $targ;
# Grace period over: escalate to SIGKILL.
$count = kill(9*$pg, $targ); # SIGKILL is 9
$reaped = waitpid($targ, WNOHANG);
return undef if $reaped < 0;
return $? if $reaped == $targ;
# We should not need to wait to reap a SIGKILL, however, just in case
# the system doesn't make a SIGKILL'd process immediately reapable
# (perhaps under extremely heavy load) we accommodate a brief delay
# NOTE(review): this loop reuses the same, by now usually expired,
# $harsh deadline, so it can only run if the first loop exited early;
# verify against upstream whether a deadline reset line is missing here.
while ($count && time() < $harsh) {
	select(undef, undef, undef, 0.2);
	$reaped = waitpid($targ, WNOHANG);
	return undef if $reaped < 0;
	return $? if $reaped == $targ;
# First ^C/SIGINT: let the currently running jobs finish, and rebind
# SIGINT so a second ^C (handle_exit) aborts immediately.
# NOTE(review): a line clearing the pending queue and the closing brace
# appear to be missing from this extract.
sub handle_softexit {
	error("Waiting for outstanding jobs to finish... ".
		"^C again to exit immediately");
	$SIG{'INT'} = \&handle_exit;
# --- body of handle_exit (sub header not visible): hard shutdown path ---
error("Killing outstanding jobs, please be patient...");
# Ignore our own TERM so the cleanup below is not interrupted.
$SIG{'TERM'} = 'IGNORE';
# Kill each running job's whole process group (second argument = 1).
# NOTE(review): the enclosing loop over the running jobs is not visible
# in this extract.
kill_gently($_->{'pid'}, 1);
unlink $lockfile if ($locked);
# --- fragment of queue_job(%opts): stamp queue-time metadata and apply
# defaults before the job hash is pushed onto the queue ---
$opts{'queued_at'} = time;
$opts{'dont_run'} = 0;
$opts{'intensive'} = 0 unless exists $opts{'intensive'};
# --- fragment of run_job($job): invoke the job's command callback; the
# callback may set dont_run (via job_skip) to cancel actual execution ---
$job->{'command'}->($job);
if ($job->{'dont_run'}) {
# --- body of _job_name($job): human-readable label used in log output ---
"[".$job->{'type'}."::".$job->{'project'}."]";
# Only one of those per job!
# Fork and exec a job's external command.  The parent records pid and
# started_at on $job; the child redirects its stdio to /dev/null, moves
# into its own process group (so kill_gently can kill the whole tree),
# and execs the command.
# NOTE(review): several lines are missing from this extract -- the
# declaration of $pid, the child/parent branch, the exit statements
# inside the error handlers, the exec call and closing braces.
sub exec_job_command {
	my ($job, $command, $err_only) = @_;
	$job->{'finished'} = 0;
	delete $job->{'pid'};
	if (!defined($pid = fork)) {
		error(_job_name($job) ." Can't fork job: $!");
		$job->{'finished'} = 1;
	# Brief pause before continuing in the child (presumably to dodge a
	# startup race -- TODO confirm intent).
	select(undef, undef, undef, 0.1);
	# BUG(review): "||" binds tighter than the list comma, so it attaches
	# to the always-true string '/dev/null' and this "|| do {...}" error
	# handler can never run; it should use the low-precedence "or".
	open STDIN, '<', '/dev/null' || do {
		error(_job_name($job) ."Can't read from /dev/null: $!");
	# BUG(review): same "||"-precedence problem as above; use "or".
	open STDOUT, '>', '/dev/null' || do {
		error(_job_name($job) ." Can't write to /dev/null: $!");
	# New process group so we can keep track of all of its children
	if (!defined(POSIX::setpgid(0, 0))) {
		error(_job_name($job) ." Can't create process group: $!");
	# Stop perl from complaining
	$job->{'pid'} = $pid;
	$job->{'started_at'} = time;
# --- body of job_skip($job, $msg) (sub header not visible): mark the job
# as not-to-run and log the reason, unless quiet or no message given ---
my ($job, $msg) = @_;
$job->{'dont_run'} = 1;
error(_job_name($job) ." Skipping job: $msg") unless $quiet || !$msg;
# Kill any running job that has exceeded its timeout ($kill_after scaled
# by the job's timeout_factor) and record it in @jobs_killed.
# NOTE(review): the enclosing loop over the running jobs and the closing
# braces are not visible in this extract; $kill_after and @jobs_killed
# are file-level variables declared elsewhere.
sub reap_hanging_jobs {
	my $factor = $_->{'timeout_factor'} || 1;
	if (defined($_->{'started_at'}) && (time - $_->{'started_at'}) > ($kill_after * $factor)) {
		$_->{'finished'} = 1;
		# Kill the job's entire process group (second argument = 1).
		my $exitcode = kill_gently($_->{'pid'}, 1);
		error(_job_name($_) ." KILLED due to timeout" .
			# The low 7 bits of a wait status are the terminating
			# signal; 9 means the process died from our SIGKILL.
			(($exitcode & 0x7f) == 9 ? " with SIGKILL": ""));
		push @jobs_killed, _job_name($_);
# --- fragment of reap_one_job($job) (sub header not visible): fire the
# job's success/error callback exactly once, then mark it finished.
# NOTE(review): the branch structure around the on_error call is not
# visible in this extract.
if (!$job->{'finished'}) {
	$job->{'on_success'}->($job) if defined($job->{'on_success'});
	$job->{'finished'} = 1;
$job->{'on_error'}->($job) if defined($job->{'on_error'});
# Collect exited children: first handle jobs that kill_gently already
# reaped (marked 'killed'), then waitpid-poll for any other finished
# child, run its callbacks and drop it from the running list.
# NOTE(review): the waitpid loop structure, the declaration of $pid and
# several other lines are not visible in this extract.
sub reap_finished_jobs {
	my $finished_any = 0;
	# Jobs already reaped by kill_gently just need their callbacks run.
	foreach my $child (grep { !$_->{'pid'} && $_->{'killed'} } @running) {
		delete $child->{'killed'};
		reap_one_job($child);
	$pid = waitpid(-1, WNOHANG);
	my @child = grep { $_->{'pid'} && $_->{'pid'} == $pid } @running;
	# any non-zero exit status should trigger on_error
	$child[0]->{'finished'} = 1 if @child;
	delete $child[0]->{'pid'};
	reap_one_job($child[0]);
	# Keep only the jobs that are still running.
	@running = grep { $_->{'finished'} == 0 } @running;
# In scalar context (how callers use it), yields the number of currently
# running resource-intensive jobs.
sub have_intensive_jobs {
	grep { $_->{'intensive'} == 1 } @running;
# --- body of ts() (sub header not visible): timestamp prefix used in
# log messages ---
"[". scalar(localtime) ."] ";
# --- body of get_load_info() (sub header not visible): return the 1-,
# 5- and 15-minute load averages, or undef when they cannot be read ---
# NOTE(review): bareword global filehandle LOADAV with no visible
# close(); a lexical handle ("open my $fh, ...") would be the modern
# form, though the bareword is harmless in this single-threaded daemon.
if ($^O eq "linux") {
	# Read /proc/loadavg on Linux
	open(LOADAV, '<', '/proc/loadavg') or return undef;
	my $loadinfo = <LOADAV>;
	# First three whitespace-separated fields are the load averages.
	return (split(/\s/, $loadinfo, 4))[0..2];
# Read the output of uptime everywhere else (works on Linux too)
open(LOADAV, '-|', 'uptime') or return undef;
my $loadinfo = <LOADAV>;
$loadinfo =~ /load average[^0-9.]*([0-9.]+)[^0-9.]+([0-9.]+)[^0-9.]+([0-9.]+)/iso or return undef;
# --- body of run_queue() (sub header not visible): the main scheduling
# loop.  Drains @queue into @running while respecting the parallelism
# limits and the system-load triggers, reporting progress periodically.
# NOTE(review): many lines (branch braces, loop bodies, declarations of
# $overloaded/$load_info/@run_status and the counters) are missing from
# this extract; comments describe only what is visible.
my $last_progress = time;
my $last_checkload = time - 5;  # force an immediate first load check
my $current_load = $load_trig;
my $s = @queue == 1 ? '' : 's';
ferror("--- Processing %d queued job$s", scalar(@queue));
$SIG{'INT'} = \&handle_softexit;
$SIG{'TERM'} = \&handle_exit;
while (@queue || @running) {
	my $proceed_immediately = reap_finished_jobs();
	# Check current system load
	if ($load_trig && (time - $last_checkload) >= 5 && defined((my @loadinfo = get_load_info())[0])) {
		# NOTE(review): this "my" shadows the outer $current_load, so
		# the outer copy never updates -- verify against upstream
		# whether that is intentional.
		my $current_load = $loadinfo[0];
		if ($current_load > $load_trig && !$overloaded) {
			error("PAUSE: system load is at $current_load > $load_trig") if $progress;
		} elsif ($current_load < $load_untrig && $overloaded) {
			error("RESUME: system load is at $current_load < $load_untrig") if $progress;
		$load_info = ', paused (load '. $current_load .')';
		$load_info = ', load '. $current_load;
		$last_checkload = time;
	# Periodic status report, at most once a minute.
	if ($progress && (time - $last_progress) >= 60) {
		ferror("STATUS: %d queued, %d running, %d finished, %d skipped, %d killed$load_info", scalar(@queue), scalar(@running), $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
		push @run_status, _job_name($_)." ". (time - $_->{'started_at'}) ."s";
		error("STATUS: currently running: ". join(', ', @run_status));
		$last_progress = time;
	# Back off if we're too busy
	if (@running >= $max_par || have_intensive_jobs() >= $max_par_intensive || !@queue || $overloaded) {
		sleep 1 unless $proceed_immediately;
	run_job(shift(@queue)) if @queue;
my $s = $jobs_executed == 1 ? '' : 's';
ferror("--- Queue processed. %d job$s executed, %d skipped, %d killed.", $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
# Perpetual mode: take the lock file, then repeatedly queue all projects
# and process the queue until interrupted, sleeping between rounds.
# NOTE(review): the lockfile existence check's condition, the pid
# writing, the main loop and the unlink are only partially visible in
# this extract.
sub run_perpetually {
	die "Lockfile '$lockfile' exists. Please make sure no other instance of jobd is running.";
	# BUG(review): "||" binds to $lockfile (always true), so this die can
	# never fire when open fails; it should be the low-precedence "or".
	# A lexical filehandle would also be preferable to the bareword LOCK.
	open LOCK, '>', $lockfile || die "Cannot create lockfile '$lockfile': $!";
	# touch ctime of lockfile to prevent it from being removed by /tmp cleaning
	chmod 0444, $lockfile;
	chmod 0644, $lockfile;
	sleep($restart_delay) if $perpetual; # Let the system breathe for a moment
######### Helpers {{{1

# --- body of error() (sub header not visible): print a timestamped
# message to STDERR ---
print STDERR ts().shift()."\n";
# --- body of ferror(): sprintf-style formatting wrapper around error() ---
error(sprintf($_[0], @_[1..$#_]));
# --- top-level startup code (partially visible): close the DATA handle,
# parse the command line, validate option combinations, and derive the
# numeric load thresholds.
# NOTE(review): the '--one' option entry, the closing parenthesis of
# GetOptions and several surrounding lines are missing from this extract.
close(DATA) if fileno(DATA);
Getopt::Long::Configure('bundling');
my $parse_res = GetOptions(
	'help|?|h' => sub { pod2usage(-verbose => 2, -exitval => 0); },
	'quiet|q' => \$quiet,
	'progress|P' => \$progress,
	'kill-after|k=i' => \$kill_after,
	'max-parallel|p=i' => \$max_par,
	'max-intensive-parallel|i=i' => \$max_par_intensive,
	'load-triggers=s' => \$load_triggers,
	'restart-delay|d=i' => \$restart_delay,
	'lockfile|l=s' => \$lockfile,
	'all-once|a' => \$all_once,
# --all-once and --one are mutually exclusive.
fatal("Error: can only use one out of --all-once and --one")
	if ($all_once && $one);
$ENV{'show_progress'} = '1';
# Disable load checking entirely when load info cannot be obtained.
$load_triggers = '0,0' unless defined((get_load_info())[0]);
($load_trig, $load_untrig) = split(/,/, $load_triggers);
511 ########## Documentation {{{1
517 jobd.pl - Perform Girocco maintenance jobs
524 -h | --help detailed instructions
525 -q | --quiet run quietly
526 -P | --progress show occasional status updates
527 -k SECONDS | --kill-after SECONDS how long to wait before killing jobs
528 -p NUM | --max-parallel NUM how many jobs to run at the same time
529 -i NUM | --max-intensive-parallel NUM how many resource-hungry jobs to run
531 --load-triggers TRIG,UNTRIG stop queueing jobs at load above
532 TRIG and resume at load below UNTRIG
533  -d SECONDS | --restart-delay SECONDS wait for this many seconds between
535 -l FILE | --lockfile FILE create a lockfile in the given
537 -a | --all-once process the list only once
538 -o PRJNAME | --one PRJNAME process only one project
546 Print the full description of jobd.pl's options.
550 Suppress non-error messages, e.g. for use when running this task as a cronjob.
554 Show information about the current status of the job queue occasionally. This
555 is automatically enabled if --quiet is not given.
557 =item B<--kill-after SECONDS>
559 Kill supervised jobs after a certain time to avoid hanging the daemon.
561 =item B<--max-parallel NUM>
563 Run no more than that many jobs at the same time. The default is the number
564 of cpus * 2. If the number of cpus cannot be determined, the default is 8.
566 =item B<--max-intensive-parallel NUM>
568 Run no more than that many resource-hungry jobs at the same time. Right now,
569 this refers to repacking jobs. The default is 1.
571 =item B<--load-triggers TRIG,UNTRIG>
573 If the first system load average (1 minute average) exceeds TRIG, don't queue
574 any more jobs until it goes below UNTRIG. This is currently only supported on
575 Linux and any other platforms that provide an uptime command with load average
578 If both values are zero, load checks are disabled. The default is the number
579 of cpus * 1.5 for TRIG and half that for UNTRIG. If the number of cpus cannot
580 be determined, the default is 6,3.
582 =item B<--restart-delay SECONDS>
584 After processing the queue, wait this many seconds until the queue is
585 restarted. The default is 300 seconds.
587 =item B<--lockfile FILE>
589 For perpetual operation, specify the full path to a lock file to create and
590 then remove after finishing/aborting. The default is /tmp/jobd-$suffix.lock
591 where $suffix is a 6-character string uniquely determined by the name and
592 nickname of this Girocco instance. The pid of the running jobd instance will
593 be written to the lock file.
597 Instead of perpetually processing all projects over and over again, process
598 them just once and then exit.
600 =item B<--one PRJNAME>
602 Process only the given project (given as just the project name without C<.git>
603 suffix) and then exit.
609 jobd.pl is Girocco's repositories maintenance servant; it periodically checks
610 all the repositories and updates mirrored repositories and repacks push-mode
611 repositories when needed.