#!/usr/bin/perl

# jobd - perform Girocco maintenance jobs
#
# Run with --help for details

use strict;
use warnings;

use Getopt::Long;
use Pod::Usage;
use POSIX ":sys_wait_h";
use File::Basename;
use Cwd qw(realpath);

use lib dirname($0);
use Girocco::Config;
use Girocco::Project;
use Girocco::User;
use Girocco::Util;
BEGIN {noFatalsToBrowser}
use Girocco::ExecUtil;
# Options
my $quiet;
my $progress;
my $cpus = online_cpus;
my $kill_after = 900;
my $max_par = $cpus ? $cpus * 2 : 8;
my $max_par_intensive = 1;
my $load_triggers = $cpus ? sprintf("%g,%g", $cpus * 1.5, $cpus * 0.75) : "6,3";
my $lockfile = "/tmp/jobd-$Girocco::Config::tmpsuffix.lock";
my $restart_delay = 300;
my $all_once;
my $same_pid;
my $one;

my ($load_trig, $load_untrig);

######### Jobs {{{1
sub update_project {
	my $job = shift;
	my $p = $job->{'project'};
	check_project_exists($job) || return;
	if (-e get_project_path($p).".nofetch" || -e get_project_path($p).".bypass" ||
	    -e get_project_path($p).".bypass_fetch") {
		job_skip($job);
		return setup_gc($job);
	}
	if (-e get_project_path($p).".clone_in_progress" && ! -e get_project_path($p).".clone_failed") {
		job_skip($job, "initial mirroring not complete yet");
		return;
	}
	if (-e get_project_path($p).".clone_failed") {
		job_skip($job, "initial mirroring failed");
		# Still need to gc non top-level clones even if they've failed
		# otherwise the objects copied into them from the parent will
		# just accumulate without bound
		setup_gc($job) if $p =~ m,/,;
		return;
	}
	if (my $ts = is_operation_uptodate($p, 'lastrefresh', rand_adjust($Girocco::Config::min_mirror_interval))) {
		job_skip($job, "not needed right now, last run at $ts");
		setup_gc($job);
		return;
	}
	if (is_svn_clone($p)) {
		# git svn can be very, very slow at times
		$job->{'timeout_factor'} = 3;
	}
	exec_job_command($job, ["$Girocco::Config::basedir/jobd/update.sh", $p], $quiet);
}
sub gc_project {
	my $job = shift;
	my $p = $job->{'project'};
	check_project_exists($job) || return;
	my $projpath = get_project_path($p);
	if (-e "$projpath.nogc" || -e "$projpath.bypass" ||
	    (-e "$projpath.delaygc" && ! -e "$projpath.allowgc" && ! -e "$projpath.needsgc")) {
		job_skip($job);
		return;
	}
	my $ts;
	if (! -e "$projpath.needsgc" &&
	    ($ts = is_operation_uptodate($p, 'lastgc', rand_adjust($Girocco::Config::min_gc_interval)))) {
		job_skip($job, "not needed right now, last run at $ts");
		return;
	}
	# allow garbage collection to run for longer than an update
	$job->{'timeout_factor'} = 2;
	exec_job_command($job, ["$Girocco::Config::basedir/jobd/gc.sh", $p], $quiet);
}
sub setup_gc {
	my $job = shift;
	queue_job(
		project => $job->{'project'},
		type => 'gc',
		command => \&gc_project,
		intensive => 1,
	);
}
sub check_project_exists {
	my $job = shift;
	my $p = $job->{'project'};
	if (!-d get_project_path($p)) {
		job_skip($job, "non-existent project");
		return 0;
	}
	1;
}

sub get_project_path {
	"$Girocco::Config::reporoot/".shift().".git/";
}
my $_last_config_path;
my $_last_config_id;
my $_last_config;
BEGIN {
	$_last_config_path = "";
	$_last_config_id = "";
	$_last_config = {};
}
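
# Single-entry cache for the most recently read config file: the parsed hash
# is reused until the file's dev:ino:size:mtime fingerprint changes.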
sub get_git_config {
	my ($projdir, $name) = @_;
	defined($projdir) && -d $projdir && -f "$projdir/config" or return undef;
	my $cf = "$projdir/config";
	my @stat = stat($cf);
	@stat && $stat[7] && $stat[9] or return undef;
	my $id = join(":", $stat[0], $stat[1], $stat[7], $stat[9]); # dev,ino,size,mtime
	if ($_last_config_path ne $cf || $_last_config_id ne $id || ref($_last_config) ne 'HASH') {
		my $data = read_config_file_hash($cf);
		defined($data) or $data = {};
		$_last_config_path = $_last_config_id = "";
		$_last_config = $data;
		$_last_config_id = $id;
		$_last_config_path = $cf;
	}
	return $_last_config->{$name};
}
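
# Returns the stored timestamp for the given operation (e.g. 'lastrefresh' or
# 'lastgc') if it is no older than $threshold seconds, otherwise undef.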
sub is_operation_uptodate {
	my ($project, $which, $threshold) = @_;
	my $path = get_project_path($project);
	my $timestamp = get_git_config($path, "gitweb.$which");
	defined($timestamp) or $timestamp = '';
	my $unix_ts = parse_rfc2822_date($timestamp) || 0;
	(time - $unix_ts) <= $threshold ? $timestamp : undef;
}

sub is_svn_clone {
	my ($project) = @_;
	my $path = get_project_path($project);
	my $baseurl = get_git_config($path, 'gitweb.baseurl');
	defined($baseurl) or $baseurl = '';
	my $svnurl = get_git_config($path, 'svn-remote.svn.url');
	defined($svnurl) or $svnurl = '';
	return $baseurl =~ /^svn[:+]/i && $svnurl;
}
sub queue_one {
	my $project = shift;
	queue_job(
		project => $project,
		type => 'update',
		command => \&update_project,
		on_success => \&setup_gc,
		on_error => \&setup_gc,
	);
}

sub queue_all {
	queue_one($_) for (Girocco::Project->get_full_list());
}

######### Daemon operation {{{1
my @queue;
my @running;
my $perpetual = 1;
my $locked = 0;
my $jobs_executed;
my $jobs_skipped;
my @jobs_killed;
# Kills and reaps the specified pid. Returns exit status ($?) on success,
# otherwise undef if the process could not be killed or reaped.
# First sends SIGINT and, if the process does not exit within 15 seconds, SIGKILL.
# We used to send SIGTERM instead of SIGINT, but by using SIGINT we can take
# advantage of "tee -i" in our update scripts and really anything we're killing
# should respond the same to either SIGINT or SIGTERM and exit gracefully.
# Usage: my $exitcode = kill_gently($pid, $kill_process_group = 0);
sub kill_gently {
	my $targ = shift;
	my $use_pg = shift || 0;
	# Note that the docs for Perl's kill state that a negative signal
	# number should be used to kill process groups and that while a
	# negative process id (and positive signal number) may also do that
	# on some platforms, that's not portable.
	my $pg = $use_pg ? -1 : 1;
	my $harsh = time() + 15; # SIGKILL after this delay
	my $count = kill(2*$pg, $targ); # SIGINT is 2
	my $reaped = waitpid($targ, WNOHANG);
	return undef if $reaped < 0;
	return $? if $reaped == $targ;
	while ($count && time() < $harsh) {
		select(undef, undef, undef, 0.2);
		$reaped = waitpid($targ, WNOHANG);
		return undef if $reaped < 0;
		return $? if $reaped == $targ;
	}
	$harsh = time() + 2;
	$count = kill(9*$pg, $targ); # SIGKILL is 9
	$reaped = waitpid($targ, WNOHANG);
	return undef if $reaped < 0;
	return $? if $reaped == $targ;
	# We should not need to wait to reap a SIGKILL, however, just in case
	# the system doesn't make a SIGKILL'd process immediately reapable
	# (perhaps under extremely heavy load) we accommodate a brief delay
	while ($count && time() < $harsh) {
		select(undef, undef, undef, 0.2);
		$reaped = waitpid($targ, WNOHANG);
		return undef if $reaped < 0;
		return $? if $reaped == $targ;
	}
	return undef;
}
sub handle_softexit {
	error("Waiting for outstanding jobs to finish... ".
	      "^C again to exit immediately");
	@queue = ();
	$perpetual = 0;
	$SIG{'INT'} = \&handle_exit;
}

sub handle_exit {
	error("Killing outstanding jobs, please be patient...");
	$SIG{'TERM'} = 'IGNORE';
	for (@running) {
		kill_gently($_->{'pid'}, 1);
	}
	unlink $lockfile if ($locked);
	exit(0);
}
sub queue_job {
	my %opts = @_;
	$opts{'queued_at'} = time;
	$opts{'dont_run'} = 0;
	$opts{'intensive'} = 0 unless exists $opts{'intensive'};
	push @queue, \%opts;
}

sub run_job {
	my $job = shift;

	push @running, $job;
	$job->{'command'}->($job);
	if ($job->{'dont_run'}) {
		pop @running;
		$jobs_skipped++;
		return;
	}
}

sub _job_name {
	my $job = shift;
	"[".$job->{'type'}."::".$job->{'project'}."]";
}
# Only one of these per job!
sub exec_job_command {
	my ($job, $command, $err_only) = @_;

	my $pid;
	$job->{'finished'} = 0;
	delete $job->{'pid'};
	if (!defined($pid = fork)) {
		error(_job_name($job) ." Can't fork job: $!");
		$job->{'finished'} = 1;
		return;
	}
	if (!$pid) {
		# "Prevent" races
		select(undef, undef, undef, 0.1);

		open STDIN, '<', '/dev/null' or do {
			error(_job_name($job) ." Can't read from /dev/null: $!");
			exit 71; # EX_OSERR
		};
		if ($err_only) {
			open STDOUT, '>', '/dev/null' or do {
				error(_job_name($job) ." Can't write to /dev/null: $!");
				exit 71; # EX_OSERR
			};
		}
		# New process group so we can keep track of all of its children
		if (!defined(POSIX::setpgid(0, 0))) {
			error(_job_name($job) ." Can't create process group: $!");
			exit 71; # EX_OSERR
		}

		exec @$command;
		# Stop perl from complaining
		exit 71; # EX_OSERR
	}
	$job->{'pid'} = $pid;
	$job->{'started_at'} = time;
}
sub job_skip {
	my ($job, $msg) = @_;
	$job->{'dont_run'} = 1;
	error(_job_name($job) ." Skipping job: $msg") unless $quiet || !$msg;
}
sub reap_hanging_jobs {
	for (@running) {
		my $factor = $_->{'timeout_factor'} || 1;
		if (defined($_->{'started_at'}) && (time - $_->{'started_at'}) > ($kill_after * $factor)) {
			$_->{'finished'} = 1;
			my $exitcode = kill_gently($_->{'pid'}, 1);
			delete $_->{'pid'};
			$_->{'killed'} = 1;
			error(_job_name($_) ." KILLED due to timeout" .
			      (($exitcode & 0x7f) == 9 ? " with SIGKILL": ""));
			push @jobs_killed, _job_name($_);
		}
	}
}
sub reap_one_job {
	my $job = shift;
	if (!$job->{'finished'}) {
		$job->{'on_success'}->($job) if defined($job->{'on_success'});
		$job->{'finished'} = 1;
		$jobs_executed++;
	} else {
		$job->{'on_error'}->($job) if defined($job->{'on_error'});
	}
}
sub reap_finished_jobs {
	my $pid;
	my $finished_any = 0;
	foreach my $child (grep { !$_->{'pid'} && $_->{'killed'} } @running) {
		delete $child->{'killed'};
		reap_one_job($child);
		$finished_any = 1;
	}
	while (1) {
		$pid = waitpid(-1, WNOHANG);
		last if $pid <= 0;
		$finished_any = 1;

		my @child = grep { $_->{'pid'} && $_->{'pid'} == $pid } @running;
		if ($?) {
			# any non-zero exit status should trigger on_error
			$child[0]->{'finished'} = 1 if @child;
		}
		if (@child) {
			delete $child[0]->{'pid'};
			reap_one_job($child[0]);
		}
	}
	@running = grep { $_->{'finished'} == 0 } @running;
	$finished_any;
}
sub have_intensive_jobs {
	grep { $_->{'intensive'} == 1 } @running;
}

sub ts {
	"[". scalar(localtime) ."] ";
}
sub get_load_info {
	if ($^O eq "linux") {
		# Read /proc/loadavg on Linux
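		# (a loadavg line looks like "0.15 0.10 0.05 1/123 4567"; the
		# first three fields are the 1, 5 and 15 minute load averages)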
		open(LOADAV, '<', '/proc/loadavg') or return undef;
		my $loadinfo = <LOADAV>;
		close LOADAV;
		return (split(/\s/, $loadinfo, 4))[0..2];
	} else {
		# Read the output of uptime everywhere else (works on Linux too)
		open(LOADAV, '-|', 'uptime') or return undef;
		my $loadinfo = <LOADAV>;
		close LOADAV;
		$loadinfo =~ /load average[^0-9.]*([0-9.]+)[^0-9.]+([0-9.]+)[^0-9.]+([0-9.]+)/iso or return undef;
		return ($1, $2, $3);
	}
}
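
# Process everything in @queue: start queued jobs while staying within the
# parallelism and load limits, reap finished and hanging jobs, and print an
# occasional status line until both @queue and @running are empty.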
sub run_queue {
	my $last_progress = time;
	my $last_checkload = time - 5;
	my $current_load = $load_trig;
	my $overloaded = 0;
	my $load_info = '';
	$jobs_executed = 0;
	$jobs_skipped = 0;
	@jobs_killed = ();
	if ($progress) {
		my $s = @queue == 1 ? '' : 's';
		ferror("--- Processing %d queued job$s", scalar(@queue));
	}
	$SIG{'INT'} = \&handle_softexit;
	$SIG{'TERM'} = \&handle_exit;
	while (@queue || @running) {
		reap_hanging_jobs();
		my $proceed_immediately = reap_finished_jobs();
		# Check current system load
		if ($load_trig && (time - $last_checkload) >= 5 && defined((my @loadinfo = get_load_info())[0])) {
			my $current_load = $loadinfo[0];
			if ($current_load > $load_trig && !$overloaded) {
				$overloaded = 1;
				error("PAUSE: system load is at $current_load > $load_trig") if $progress;
			} elsif ($current_load < $load_untrig && $overloaded) {
				$overloaded = 0;
				error("RESUME: system load is at $current_load < $load_untrig") if $progress;
			}
			if ($overloaded) {
				$load_info = ', paused (load '. $current_load .')';
			} else {
				$load_info = ', load '. $current_load;
			}
			$last_checkload = time;
		}
		# Status output
		if ($progress && (time - $last_progress) >= 60) {
			ferror("STATUS: %d queued, %d running, %d finished, %d skipped, %d killed$load_info", scalar(@queue), scalar(@running), $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
			if (@running) {
				my @run_status;
				for (@running) {
					push @run_status, _job_name($_)." ". (time - $_->{'started_at'}) ."s";
				}
				error("STATUS: currently running: ". join(', ', @run_status));
			}
			$last_progress = time;
		}
		# Back off if we're too busy
		if (@running >= $max_par || have_intensive_jobs() >= $max_par_intensive || !@queue || $overloaded) {
			sleep 1 unless $proceed_immediately;
			next;
		}
		# Run next
		run_job(shift(@queue)) if @queue;
	}
	if ($progress) {
		my $s = $jobs_executed == 1 ? '' : 's';
		ferror("--- Queue processed. %d job$s executed, %d skipped, %d killed.", $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
	}
}
sub run_perpetually {
	if (-e $lockfile) {
		die "Lockfile '$lockfile' exists. Please make sure no other instance of jobd is running.\n";
	}
	open LOCK, '>', $lockfile or die "Cannot create lockfile '$lockfile': $!\n";
	print LOCK $$;
	close LOCK;
	$locked = 1;

	my $result = "";
	while ($perpetual) {
		# touch ctime of lockfile to prevent it from being removed by /tmp cleaning
		chmod 0640, $lockfile;
		chmod 0644, $lockfile;
		# check for restart request
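		# (an external process can request a graceful restart by replacing
		# the lockfile's contents with the single word "restart"; the
		# request is noticed at the top of the next pass)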
		open LOCK, '<', $lockfile or die "Lock file '$lockfile' has disappeared!\n";
		my $request = <LOCK>;
		close LOCK;
		chomp $request if defined($request);
		if (defined($request) && $request eq "restart") {
			$result = $request;
			last;
		}
		queue_all();
		run_queue();
		sleep($restart_delay) if $perpetual; # Let the system breathe for a moment
	}
	unlink $lockfile;
	$locked = 0;
	return $result;
}
######### Helpers {{{1

sub error($) {
	print STDERR ts().shift()."\n";
}

sub ferror(@) {
	error(sprintf($_[0], @_[1..$#_]));
}

sub fatal($) {
	error(shift);
	exit 1;
}
######### Main {{{1

my $reexec = Girocco::ExecUtil->new;
my $realpath0 = realpath($0);
chdir "/";
close(DATA) if fileno(DATA);
# Parse options
Getopt::Long::Configure('bundling');
my $parse_res = GetOptions(
	'help|?|h' => sub {
		pod2usage(-verbose => 2, -exitval => 0, -input => $realpath0)},
	'quiet|q' => \$quiet,
	'progress|P' => \$progress,
	'kill-after|k=i' => \$kill_after,
	'max-parallel|p=i' => \$max_par,
	'max-intensive-parallel|i=i' => \$max_par_intensive,
	'load-triggers=s' => \$load_triggers,
	'restart-delay|d=i' => \$restart_delay,
	'lockfile|l=s' => \$lockfile,
	'same-pid' => \$same_pid,
	'all-once|a' => \$all_once,
	'one|o=s' => \$one,
) || pod2usage(-exitval => 2, -input => $realpath0);
fatal("Error: can only use one out of --all-once and --one")
	if ($all_once && $one);
unless ($quiet) {
	$ENV{'show_progress'} = '1';
	$progress = 1;
}

$load_triggers = '0,0' unless defined((get_load_info())[0]);
($load_trig, $load_untrig) = split(/,/, $load_triggers);

if ($one) {
	queue_one($one);
	run_queue();
	exit;
}

if ($all_once) {
	queue_all();
	run_queue();
	exit;
}
{
	if (run_perpetually() eq "restart") {
		error("Restarting in response to restart request... ");
		$reexec->reexec($same_pid);
		error("Continuing after failed restart: $!");
		chdir "/";
		redo;
	}
}
########## Documentation {{{1

__END__
=head1 NAME

jobd.pl - Perform Girocco maintenance jobs

=head1 SYNOPSIS

jobd.pl [options]

 Options:
   -h | --help                           detailed instructions
   -q | --quiet                          run quietly
   -P | --progress                       show occasional status updates
   -k SECONDS | --kill-after SECONDS     how long to wait before killing jobs
   -p NUM | --max-parallel NUM           how many jobs to run at the same time
   -i NUM | --max-intensive-parallel NUM how many resource-hungry jobs to run
                                         at the same time
   --load-triggers TRIG,UNTRIG           stop queueing jobs at load above
                                         TRIG and resume at load below UNTRIG
   -d SECONDS | --restart-delay SECONDS  wait for this many seconds between
                                         queue runs
   -l FILE | --lockfile FILE             create a lockfile in the given
                                         location
   --same-pid                            keep same pid during graceful restart
   -a | --all-once                       process the list only once
   -o PRJNAME | --one PRJNAME            process only one project
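
A few illustrative invocations (the project name below is just a placeholder):

   jobd.pl --all-once --progress     process every project once, then exit
   jobd.pl --one myproject           update (and gc) a single project, then exit
   jobd.pl -p 4 --load-triggers 6,3  perpetual mode, at most 4 parallel jobs
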
=head1 OPTIONS

=over 8

=item B<--help>

Print the full description of jobd.pl's options.

=item B<--quiet>

Suppress non-error messages, e.g. for use when running this task as a cronjob.

=item B<--progress>

Show information about the current status of the job queue occasionally. This
is automatically enabled if --quiet is not given.

=item B<--kill-after SECONDS>

Kill supervised jobs after the given number of seconds to avoid hanging the
daemon. The default is 900 seconds.

=item B<--max-parallel NUM>

Run no more than that many jobs at the same time. The default is the number
of cpus * 2. If the number of cpus cannot be determined, the default is 8.

=item B<--max-intensive-parallel NUM>

Run no more than that many resource-hungry jobs at the same time. Right now,
this refers to repacking jobs. The default is 1.

=item B<--load-triggers TRIG,UNTRIG>

If the first system load average (1 minute average) exceeds TRIG, don't queue
any more jobs until it goes below UNTRIG. This is currently only supported on
Linux and on any other platform that provides an uptime command with load
average output.

If both values are zero, load checks are disabled. The default is the number
of cpus * 1.5 for TRIG and half that for UNTRIG. If the number of cpus cannot
be determined, the default is 6,3.

=item B<--restart-delay SECONDS>

After processing the queue, wait this many seconds until the queue is
restarted. The default is 300 seconds.

=item B<--lockfile FILE>

For perpetual operation, specify the full path to a lock file to create and
then remove after finishing/aborting. The default is /tmp/jobd-$suffix.lock
where $suffix is a 6-character string uniquely determined by the name and
nickname of this Girocco instance. The pid of the running jobd instance will
be written to the lock file.

=item B<--same-pid>

When performing a graceful restart, keep the same pid rather than switching to
a new one.

=item B<--all-once>

Instead of perpetually processing all projects over and over again, process
them just once and then exit.

=item B<--one PRJNAME>

Process only the given project (given as just the project name without C<.git>
suffix) and then exit.

=back
=head1 DESCRIPTION

jobd.pl is Girocco's repository maintenance servant; it periodically checks
all the repositories, updating mirrored repositories and repacking push-mode
repositories as needed.

=cut