3 # jobd - perform Girocco maintenance jobs
5 # Run with --help for details
12 use POSIX
":sys_wait_h";
21 BEGIN {noFatalsToBrowser
}
22 use Girocco
::ExecUtil
;
27 my $cpus = online_cpus
;
29 my $max_par = $cpus ?
$cpus * 2 : 8;
30 my $max_par_intensive = 1;
31 my $load_triggers = $cpus ?
sprintf("%g,%g", $cpus * 1.5, $cpus * 0.75) : "6,3";
32 my $lockfile = "/tmp/jobd-$Girocco::Config::tmpsuffix.lock";
33 my $restart_delay = 300;
38 my ($load_trig, $load_untrig);
44 my $p = $job->{'project'};
45 check_project_exists
($job) || return;
46 if (-e get_project_path
($p).".nofetch" || -e get_project_path
($p).".bypass" ||
47 -e get_project_path
($p).".bypass_fetch") {
49 return setup_gc
($job);
51 if (-e get_project_path
($p).".clone_in_progress" && ! -e get_project_path
($p).".clone_failed") {
52 job_skip
($job, "initial mirroring not complete yet");
55 if (-e get_project_path
($p).".clone_failed") {
56 job_skip
($job, "initial mirroring failed");
57 # Still need to gc non top-level clones even if they've failed
58 # otherwise the objects copied into them from the parent will
59 # just accumulate without bound
60 setup_gc
($job) if $p =~ m
,/,;
63 if (my $ts = is_operation_uptodate
($p, 'lastrefresh', rand_adjust
($Girocco::Config
::min_mirror_interval
))) {
64 job_skip
($job, "not needed right now, last run at $ts");
68 if (is_svn_clone
($p)) {
69 # git svn can be very, very slow at times
70 $job->{'timeout_factor'} = 3;
72 exec_job_command
($job, ["$Girocco::Config::basedir/jobd/update.sh", $p], $quiet);
77 my $p = $job->{'project'};
78 check_project_exists
($job) || return;
79 my $projpath = get_project_path
($p);
80 if (-e
"$projpath.nogc" || -e
"$projpath.bypass" ||
81 (-e
"$projpath.delaygc" && ! -e
"$projpath.allowgc" && ! -e
"$projpath.needsgc")) {
86 if (! -e
"$projpath.needsgc" &&
87 ($ts = is_operation_uptodate
($p, 'lastgc', rand_adjust
($Girocco::Config
::min_gc_interval
)))) {
88 job_skip
($job, "not needed right now, last run at $ts");
91 # allow garbage collection to run for longer than an update
92 $job->{'timeout_factor'} = 2;
93 exec_job_command
($job, ["$Girocco::Config::basedir/jobd/gc.sh", $p], $quiet);
99 project
=> $job->{'project'},
101 command
=> \
&gc_project
,
106 sub check_project_exists
{
108 my $p = $job->{'project'};
109 if (!-d get_project_path
($p)) {
110 job_skip
($job, "non-existent project");
116 sub get_project_path
{
117 "$Girocco::Config::reporoot/".shift().".git/";
120 my $_last_config_path;
124 $_last_config_path = "";
125 $_last_config_id = "";
130 my ($projdir, $name) = @_;
131 defined($projdir) && -d
$projdir && -f
"$projdir/config" or return undef;
132 my $cf = "$projdir/config";
133 my @stat = stat($cf);
134 @stat && $stat[7] && $stat[9] or return undef;
135 my $id = join(":", $stat[0], $stat[1], $stat[7], $stat[9]); # dev,ino,size,mtime
136 if ($_last_config_path ne $cf || $_last_config_id ne $id || ref($_last_config) ne 'HASH') {
137 my $data = read_config_file_hash
($cf);
138 defined($data) or $data = {};
139 $_last_config_path = $_last_config_id = "";
140 $_last_config = $data;
141 $_last_config_id = $id;
142 $_last_config_path = $cf;
144 return $_last_config->{$name};
147 sub is_operation_uptodate
{
148 my ($project, $which, $threshold) = @_;
149 my $path = get_project_path
($project);
150 my $timestamp = get_git_config
($path, "gitweb.$which");
151 defined($timestamp) or $timestamp = '';
152 my $unix_ts = parse_rfc2822_date
($timestamp) || 0;
153 (time - $unix_ts) <= $threshold ?
$timestamp : undef;
158 my $path = get_project_path
($project);
159 my $baseurl = get_git_config
($path, 'gitweb.baseurl');
160 defined($baseurl) or $baseurl = '';
161 my $svnurl = get_git_config
($path, 'svn-remote.svn.url');
162 defined($svnurl) or $svnurl = '';
163 return $baseurl =~ /^svn[:+]/i && $svnurl;
171 command
=> \
&update_project
,
172 on_success
=> \
&setup_gc
,
173 on_error
=> \
&setup_gc
,
178 queue_one
($_) for (Girocco
::Project
->get_full_list());
181 ######### Daemon operation {{{1
191 # Kills and reaps the specified pid. Returns exit status ($?) on success
192 # otherwise undef if process could not be killed or reaped
193 # First sends SIGINT and if process does not exit within 15 seconds then SIGKILL
194 # We used to send SIGTERM instead of SIGINT, but by using SIGINT we can take
195 # advantage of "tee -i" in our update scripts and really anything we're killing
196 # should respond the same to either SIGINT or SIGTERM and exit gracefully.
197 # Usage: my $exitcode = kill_gently($pid, $kill_process_group = 0);
200 my $use_pg = shift || 0;
201 # Note that the docs for Perl's kill state that a negative signal
202 # number should be used to kill process groups and that while a
203 # negative process id (and positive signal number) may also do that
204 # on some platforms, that's not portable.
205 my $pg = $use_pg ?
-1 : 1;
206 my $harsh = time() + 15; # SIGKILL after this delay
207 my $count = kill(2*$pg, $targ); # SIGINT is 2
208 my $reaped = waitpid($targ, WNOHANG
);
209 return undef if $reaped < 0;
210 return $?
if $reaped == $targ;
211 while ($count && time() < $harsh) {
212 select(undef, undef, undef, 0.2);
213 $reaped = waitpid($targ, WNOHANG
);
214 return undef if $reaped < 0;
215 return $?
if $reaped == $targ;
218 $count = kill(9*$pg, $targ); # SIGKILL is 9
219 $reaped = waitpid($targ, WNOHANG
);
220 return undef if $reaped < 0;
221 return $?
if $reaped == $targ;
222 # We should not need to wait to reap a SIGKILL, however, just in case
223 # the system doesn't make a SIGKILL'd process immediately reapable
224 # (perhaps under extremely heavy load) we accommodate a brief delay
225 while ($count && time() < $harsh) {
226 select(undef, undef, undef, 0.2);
227 $reaped = waitpid($targ, WNOHANG
);
228 return undef if $reaped < 0;
229 return $?
if $reaped == $targ;
234 sub handle_softexit
{
235 error
("Waiting for outstanding jobs to finish... ".
236 "^C again to exit immediately");
239 $SIG{'INT'} = \
&handle_exit
;
243 error
("Killing outstanding jobs, please be patient...");
244 $SIG{'TERM'} = 'IGNORE';
246 kill_gently
($_->{'pid'}, 1);
248 unlink $lockfile if ($locked);
254 $opts{'queued_at'} = time;
255 $opts{'dont_run'} = 0;
256 $opts{'intensive'} = 0 unless exists $opts{'intensive'};
264 $job->{'command'}->($job);
265 if ($job->{'dont_run'}) {
274 "[".$job->{'type'}."::".$job->{'project'}."]";
277 # Only one of those per job!
278 sub exec_job_command
{
279 my ($job, $command, $err_only) = @_;
282 $job->{'finished'} = 0;
283 delete $job->{'pid'};
284 if (!defined($pid = fork)) {
285 error
(_job_name
($job) ." Can't fork job: $!");
286 $job->{'finished'} = 1;
291 select(undef, undef, undef, 0.1);
293 open STDIN
, '<', '/dev/null' || do {
294 error
(_job_name
($job) ."Can't read from /dev/null: $!");
298 open STDOUT
, '>', '/dev/null' || do {
299 error
(_job_name
($job) ." Can't write to /dev/null: $!");
303 # New process group so we can keep track of all of its children
304 if (!defined(POSIX
::setpgid
(0, 0))) {
305 error
(_job_name
($job) ." Can't create process group: $!");
310 # Stop perl from complaining
313 $job->{'pid'} = $pid;
314 $job->{'started_at'} = time;
318 my ($job, $msg) = @_;
319 $job->{'dont_run'} = 1;
320 error
(_job_name
($job) ." Skipping job: $msg") unless $quiet || !$msg;
323 sub reap_hanging_jobs
{
325 my $factor = $_->{'timeout_factor'} || 1;
326 if (defined($_->{'started_at'}) && (time - $_->{'started_at'}) > ($kill_after * $factor)) {
327 $_->{'finished'} = 1;
328 my $exitcode = kill_gently
($_->{'pid'}, 1);
331 error
(_job_name
($_) ." KILLED due to timeout" .
332 (($exitcode & 0x7f) == 9 ?
" with SIGKILL": ""));
333 push @jobs_killed, _job_name
($_);
340 if (!$job->{'finished'}) {
341 $job->{'on_success'}->($job) if defined($job->{'on_success'});
342 $job->{'finished'} = 1;
345 $job->{'on_error'}->($job) if defined($job->{'on_error'});
349 sub reap_finished_jobs
{
351 my $finished_any = 0;
352 foreach my $child (grep { !$_->{'pid'} && $_->{'killed'} } @running) {
353 delete $child->{'killed'};
354 reap_one_job
($child);
358 $pid = waitpid(-1, WNOHANG
);
362 my @child = grep { $_->{'pid'} && $_->{'pid'} == $pid } @running;
364 # any non-zero exit status should trigger on_error
365 $child[0]->{'finished'} = 1 if @child;
368 delete $child[0]->{'pid'};
369 reap_one_job
($child[0]);
372 @running = grep { $_->{'finished'} == 0 } @running;
376 sub have_intensive_jobs
{
377 grep { $_->{'intensive'} == 1 } @running;
381 "[". scalar(localtime) ."] ";
385 if ($^O
eq "linux") {
386 # Read /proc/loadavg on Linux
387 open(LOADAV
, '<', '/proc/loadavg') or return undef;
388 my $loadinfo = <LOADAV
>;
390 return (split(/\s/, $loadinfo, 4))[0..2];
392 # Read the output of uptime everywhere else (works on Linux too)
393 open(LOADAV
, '-|', 'uptime') or return undef;
394 my $loadinfo = <LOADAV
>;
396 $loadinfo =~ /load average[^0-9.]*([0-9.]+)[^0-9.]+([0-9.]+)[^0-9.]+([0-9.]+)/iso or return undef;
402 my $last_progress = time;
403 my $last_checkload = time - 5;
404 my $current_load = $load_trig;
411 my $s = @queue == 1 ?
'' : 's';
412 ferror
("--- Processing %d queued job$s", scalar(@queue));
414 $SIG{'INT'} = \
&handle_softexit
;
415 $SIG{'TERM'} = \
&handle_exit
;
416 while (@queue || @running) {
418 my $proceed_immediately = reap_finished_jobs
();
419 # Check current system load
420 if ($load_trig && (time - $last_checkload) >= 5 && defined((my @loadinfo = get_load_info
())[0])) {
421 my $current_load = $loadinfo[0];
422 if ($current_load > $load_trig && !$overloaded) {
424 error
("PAUSE: system load is at $current_load > $load_trig") if $progress;
425 } elsif ($current_load < $load_untrig && $overloaded) {
427 error
("RESUME: system load is at $current_load < $load_untrig") if $progress;
430 $load_info = ', paused (load '. $current_load .')';
432 $load_info = ', load '. $current_load;
434 $last_checkload = time;
437 if ($progress && (time - $last_progress) >= 60) {
438 ferror
("STATUS: %d queued, %d running, %d finished, %d skipped, %d killed$load_info", scalar(@queue), scalar(@running), $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
442 push @run_status, _job_name
($_)." ". (time - $_->{'started_at'}) ."s";
444 error
("STATUS: currently running: ". join(', ', @run_status));
446 $last_progress = time;
448 # Back off if we're too busy
449 if (@running >= $max_par || have_intensive_jobs
() >= $max_par_intensive || !@queue || $overloaded) {
450 sleep 1 unless $proceed_immediately;
454 run_job
(shift(@queue)) if @queue;
457 my $s = $jobs_executed == 1 ?
'' : 's';
458 ferror
("--- Queue processed. %d job$s executed, %d skipped, %d killed.", $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
462 sub run_perpetually
{
464 die "Lockfile '$lockfile' exists. Please make sure no other instance of jobd is running.\n";
466 open LOCK
, '>', $lockfile || die "Cannot create lockfile '$lockfile': $!\n";
473 # touch ctime of lockfile to prevent it from being removed by /tmp cleaning
474 chmod 0640, $lockfile;
475 chmod 0644, $lockfile;
476 # check for restart request
477 open LOCK
, '<', $lockfile || die "Lock file '$lockfile' has disappeared!\n";
478 my $request = <LOCK
>;
480 chomp $request if defined($request);
481 if (defined($request) && $request eq "restart") {
487 sleep($restart_delay) if $perpetual; # Let the system breathe for a moment
494 ######### Helpers {{{1
497 print STDERR ts
().shift()."\n";
500 error
(sprintf($_[0], @_[1..$#_]));
509 my $reexec = Girocco
::ExecUtil
->new;
510 my $realpath0 = realpath
($0);
512 close(DATA
) if fileno(DATA
);
514 Getopt
::Long
::Configure
('bundling');
515 my $parse_res = GetOptions
(
517 pod2usage
(-verbose
=> 2, -exitval
=> 0, -input
=> $realpath0)},
518 'quiet|q' => \
$quiet,
519 'progress|P' => \
$progress,
520 'kill-after|k=i' => \
$kill_after,
521 'max-parallel|p=i' => \
$max_par,
522 'max-intensive-parallel|i=i' => \
$max_par_intensive,
523 'load-triggers=s' => \
$load_triggers,
524 'restart-delay|d=i' => \
$restart_delay,
525 'lockfile|l=s' => \
$lockfile,
526 'same-pid' => \
$same_pid,
527 'all-once|a' => \
$all_once,
529 ) || pod2usage
(-exitval
=> 2, -input
=> $realpath0);
530 fatal
("Error: can only use one out of --all-once and --one")
531 if ($all_once && $one);
534 $ENV{'show_progress'} = '1';
538 $load_triggers = '0,0' unless defined((get_load_info
())[0]);
539 ($load_trig, $load_untrig) = split(/,/, $load_triggers);
554 if (run_perpetually
() eq "restart") {
555 error
("Restarting in response to restart request... ");
556 $reexec->reexec($same_pid);
557 error
("Continuing after failed restart: $!");
563 ########## Documentation {{{1
569 jobd.pl - Perform Girocco maintenance jobs
576 -h | --help detailed instructions
577 -q | --quiet run quietly
578 -P | --progress show occasional status updates
579 -k SECONDS | --kill-after SECONDS how long to wait before killing jobs
580 -p NUM | --max-parallel NUM how many jobs to run at the same time
581 -i NUM | --max-intensive-parallel NUM how many resource-hungry jobs to run
583 --load-triggers TRIG,UNTRIG stop queueing jobs at load above
584 TRIG and resume at load below UNTRIG
585 -d SECONDS | --restart-delay SECONDS wait for this many seconds between
587 -l FILE | --lockfile FILE create a lockfile in the given
589 --same-pid keep same pid during graceful restart
590 -a | --all-once process the list only once
591 -o PRJNAME | --one PRJNAME process only one project
599 Print the full description of jobd.pl's options.
603 Suppress non-error messages, e.g. for use when running this task as a cronjob.
607 Show information about the current status of the job queue occasionally. This
608 is automatically enabled if --quiet is not given.
610 =item B<--kill-after SECONDS>
612 Kill supervised jobs after a certain time to avoid hanging the daemon.
614 =item B<--max-parallel NUM>
616 Run no more than that many jobs at the same time. The default is the number
617 of cpus * 2. If the number of cpus cannot be determined, the default is 8.
619 =item B<--max-intensive-parallel NUM>
621 Run no more than that many resource-hungry jobs at the same time. Right now,
622 this refers to repacking jobs. The default is 1.
624 =item B<--load-triggers TRIG,UNTRIG>
626 If the first system load average (1 minute average) exceeds TRIG, don't queue
627 any more jobs until it goes below UNTRIG. This is currently only supported on
628 Linux and any other platforms that provide an uptime command with load average
631 If both values are zero, load checks are disabled. The default is the number
632 of cpus * 1.5 for TRIG and half that for UNTRIG. If the number of cpus cannot
633 be determined, the default is 6,3.
635 =item B<--restart-delay SECONDS>
637 After processing the queue, wait this many seconds until the queue is
638 restarted. The default is 300 seconds.
640 =item B<--lockfile FILE>
642 For perpetual operation, specify the full path to a lock file to create and
643 then remove after finishing/aborting. The default is /tmp/jobd-$suffix.lock
644 where $suffix is a 6-character string uniquely determined by the name and
645 nickname of this Girocco instance. The pid of the running jobd instance will
646 be written to the lock file.
650 When performing a graceful restart, keep the same pid rather than switching to
655 Instead of perpetually processing all projects over and over again, process
656 them just once and then exit.
658 =item B<--one PRJNAME>
660 Process only the given project (given as just the project name without C<.git>
661 suffix) and then exit.
667 jobd.pl is Girocco's repository maintenance servant; it periodically checks
668 all the repositories and updates mirrored repositories and repacks push-mode
669 repositories when needed.