gc.sh: expand optional arguments with "$@"
[girocco/readme.git] / jobd / jobd.pl
blobac411b2e199145ec6207084da188eaa4a5cc85c9
1 #!/usr/bin/perl
3 # jobd - perform Girocco maintenance jobs
5 # Run with --help for details
7 use strict;
8 use warnings;
10 use Getopt::Long;
11 use Pod::Usage;
12 use POSIX ":sys_wait_h";
13 use Cwd qw(realpath);
15 use lib "__BASEDIR__";
16 use Girocco::Config;
17 use Girocco::Project;
18 use Girocco::User;
19 use Girocco::Util;
20 BEGIN {noFatalsToBrowser}
21 use Girocco::ExecUtil;
# Options (overridable from the command line; see POD below)
my $quiet;				# suppress non-error output
my $progress;				# show occasional status updates
my $cpus = online_cpus;			# may be false if undeterminable
my $kill_after = 900;			# seconds before a hanging job is killed
my $max_par = $cpus ? $cpus * 2 : 8;	# max jobs running at once
my $max_par_intensive = 1;		# max resource-hungry jobs at once
my $load_triggers = $cpus ? sprintf("%g,%g", $cpus * 1.5, $cpus * 0.75) : "6,3";
my $lockfile = "/tmp/jobd-$Girocco::Config::tmpsuffix.lock";
my $restart_delay = 300;		# seconds between queue runs
my $all_once;				# process the project list once, then exit
my $same_pid;				# keep the same pid across graceful restart
my $one;				# process just this one project, then exit

# Parsed out of $load_triggers after option processing
my ($load_trig, $load_untrig);
39 ######### Jobs {{{1
# Run (or skip) the mirror update for the project named in $job->{'project'},
# queueing a follow-up gc job where appropriate.
sub update_project {
	my $job = shift;
	my $proj = $job->{'project'};
	check_project_exists($job) || return;
	my $path = get_project_path($proj);
	if (-e "$path.nofetch" || -e "$path.bypass" || -e "$path.bypass_fetch") {
		# Fetching is disabled for this project, but gc may still be wanted
		job_skip($job);
		return setup_gc($job);
	}
	if (-e "$path.clone_in_progress" && ! -e "$path.clone_failed") {
		job_skip($job, "initial mirroring not complete yet");
		return;
	}
	if (-e "$path.clone_failed") {
		job_skip($job, "initial mirroring failed");
		# Still need to gc non top-level clones even if they've failed
		# otherwise the objects copied into them from the parent will
		# just accumulate without bound
		setup_gc($job) if $proj =~ m{/};
		return;
	}
	if (my $ts = is_operation_uptodate($proj, 'lastrefresh', rand_adjust($Girocco::Config::min_mirror_interval))) {
		job_skip($job, "not needed right now, last run at $ts");
		setup_gc($job);
		return;
	}
	if (is_svn_clone($proj)) {
		# git svn can be very, very slow at times
		$job->{'timeout_factor'} = 3;
	}
	exec_job_command($job, ["$Girocco::Config::basedir/jobd/update.sh", $proj], $quiet);
}
# Run (or skip) garbage collection for the project named in $job->{'project'}.
sub gc_project {
	my $job = shift;
	my $proj = $job->{'project'};
	check_project_exists($job) || return;
	my $projpath = get_project_path($proj);
	if (-e "$projpath.nogc" || -e "$projpath.bypass" ||
	    (-e "$projpath.delaygc" && ! -e "$projpath.allowgc" && ! -e "$projpath.needsgc")) {
		job_skip($job);
		return;
	}
	my $ts;
	if (! -e "$projpath.needsgc" &&
	    ($ts = is_operation_uptodate($proj, 'lastgc', rand_adjust($Girocco::Config::min_gc_interval)))) {
		job_skip($job, "not needed right now, last run at $ts");
		return;
	}
	# Remember the pre-run lastgc value so maybe_setup_gc_again can tell
	# whether gc.sh cleared it (see that sub for details).
	$job->{'lastgc'} = get_git_config($projpath, "gitweb.lastgc");
	# allow garbage collection to run for longer than an update
	$job->{'timeout_factor'} = 2;
	exec_job_command($job, ["$Girocco::Config::basedir/jobd/gc.sh", $proj], $quiet);
}
# Queue a gc job for the same project as $job.
sub setup_gc {
	my $job = shift;
	queue_job(
		project    => $job->{'project'},
		type       => 'gc',
		command    => \&gc_project,
		intensive  => 1,
		on_success => \&maybe_setup_gc_again,
	);
}
# Possibly queue one (and only one) follow-up gc run for $job's project.
sub maybe_setup_gc_again {
	my $job = shift;
	# If lastgc was set then gc.sh ran successfully and now it's not set
	# then queue up another run of gc.sh for the project.
	# However, just in case, no matter what happens with the extra
	# gc.sh run no more "bonus" runs are possible to avoid any loops.
	# This allows a "mini" gc that triggers a full gc to have the
	# full gc run as part of the same --all-once run through instead
	# of waiting. A very good thing for users of the --all-once option.
	if ($job->{'lastgc'}) {
		my $projpath = get_project_path($job->{'project'});
		get_git_config($projpath, "gitweb.lastgc") or
		queue_job(
			project   => $job->{'project'},
			type      => 'gc',
			command   => \&gc_project,
			intensive => 1,
		);
	}
}
# Return true when $job's project directory exists; otherwise mark the
# job skipped and return false.
sub check_project_exists {
	my $job = shift;
	my $proj = $job->{'project'};
	if (!-d get_project_path($proj)) {
		job_skip($job, "non-existent project");
		return 0;
	}
	return 1;
}
# Absolute repository path (with trailing slash) for the given project name.
sub get_project_path {
	my $proj = shift;
	return "$Girocco::Config::reporoot/$proj.git/";
}
# One-entry cache for get_git_config, keyed on the config file's path
# plus a dev:ino:size:mtime identity string.
my $_last_config_path;
my $_last_config_id;
my $_last_config;
BEGIN {
	$_last_config_path = "";
	$_last_config_id = "";
	$_last_config = {};
}

# Return the value of config key $name from $projdir/config, or undef if
# the file is unavailable.  The parsed file is cached and only re-read
# when its identity (dev,ino,size,mtime) changes.
sub get_git_config {
	my ($projdir, $name) = @_;
	defined($projdir) && -d $projdir && -f "$projdir/config" or return undef;
	my $cf = "$projdir/config";
	my @stat = stat($cf);
	@stat && $stat[7] && $stat[9] or return undef;
	my $id = join(":", $stat[0], $stat[1], $stat[7], $stat[9]); # dev,ino,size,mtime
	if ($_last_config_path ne $cf || $_last_config_id ne $id || ref($_last_config) ne 'HASH') {
		my $data = read_config_file_hash($cf);
		defined($data) or $data = {};
		# Invalidate first so a partially-updated cache never matches
		$_last_config_path = $_last_config_id = "";
		$_last_config = $data;
		$_last_config_id = $id;
		$_last_config_path = $cf;
	}
	return $_last_config->{$name};
}
# If the gitweb.$which timestamp of $project is within $threshold seconds
# of now, return that timestamp string; otherwise return undef.
sub is_operation_uptodate {
	my ($project, $which, $threshold) = @_;
	my $timestamp = get_git_config(get_project_path($project), "gitweb.$which");
	$timestamp = '' unless defined($timestamp);
	my $unix_ts = parse_rfc2822_date($timestamp) || 0;
	return (time - $unix_ts) <= $threshold ? $timestamp : undef;
}
# True when the project looks like a git-svn mirror (svn base URL plus a
# configured svn remote).
sub is_svn_clone {
	my ($project) = @_;
	my $path = get_project_path($project);
	my $baseurl = get_git_config($path, 'gitweb.baseurl');
	$baseurl = '' unless defined($baseurl);
	my $svnurl = get_git_config($path, 'svn-remote.svn.url');
	$svnurl = '' unless defined($svnurl);
	return $baseurl =~ /^svn[:+]/i && $svnurl;
}
# Queue an update job for a single project; gc follows on either outcome.
sub queue_one {
	my $project = shift;
	queue_job(
		project    => $project,
		type       => 'update',
		command    => \&update_project,
		on_success => \&setup_gc,
		on_error   => \&setup_gc,
	);
}
# Queue update jobs for every known project.
sub queue_all {
	queue_one($_) for (Girocco::Project->get_full_list());
}
203 ######### Daemon operation {{{1
# Shared state for the queue-running machinery below
my @queue;		# jobs waiting to be started
my @running;		# jobs currently executing
my $perpetual = 1;	# cleared to stop the perpetual loop
my $locked = 0;		# true while we own $lockfile
my $jobs_executed;	# per-run counters, reset by run_queue()
my $jobs_skipped;
my @jobs_killed;
# Kills and reaps the specified pid. Returns exit status ($?) on success
# otherwise undef if process could not be killed or reaped
# First sends SIGINT and if process does not exit within 15 seconds then SIGKILL
# We used to send SIGTERM instead of SIGINT, but by using SIGINT we can take
# advantage of "tee -i" in our update scripts and really anything we're killing
# should respond the same to either SIGINT or SIGTERM and exit gracefully.
# Usage: my $exitcode = kill_gently($pid, $kill_process_group = 0);
sub kill_gently {
	my $victim = shift;
	my $use_pg = shift || 0;
	# Note that the docs for Perl's kill state that a negative signal
	# number should be used to kill process groups and that while a
	# a negative process id (and positive signal number) may also do that
	# on some platforms, that's not portable.
	my $sigmul = $use_pg ? -1 : 1;
	my $deadline = time() + 15; # SIGKILL after this delay
	my $delivered = kill(2 * $sigmul, $victim); # SIGINT is 2
	my $reaped = waitpid($victim, WNOHANG);
	return undef if $reaped < 0;
	return $? if $reaped == $victim;
	while ($delivered && time() < $deadline) {
		select(undef, undef, undef, 0.2);
		$reaped = waitpid($victim, WNOHANG);
		return undef if $reaped < 0;
		return $? if $reaped == $victim;
	}
	$deadline = time() + 2;
	$delivered = kill(9 * $sigmul, $victim); # SIGKILL is 9
	$reaped = waitpid($victim, WNOHANG);
	return undef if $reaped < 0;
	return $? if $reaped == $victim;
	# We should not need to wait to reap a SIGKILL, however, just in case
	# the system doesn't make a SIGKILL'd process immediately reapable
	# (perhaps under extremely heavy load) we accommodate a brief delay
	while ($delivered && time() < $deadline) {
		select(undef, undef, undef, 0.2);
		$reaped = waitpid($victim, WNOHANG);
		return undef if $reaped < 0;
		return $? if $reaped == $victim;
	}
	return undef;
}
# First ^C: drain the queue gracefully; a second ^C exits immediately.
sub handle_softexit {
	error("Waiting for outstanding jobs to finish... ".
		"^C again to exit immediately");
	@queue = ();
	$perpetual = 0;
	$SIG{'INT'} = \&handle_exit;
}
# Hard exit: kill every running job (by process group), drop the lock, quit.
sub handle_exit {
	error("Killing outstanding jobs, please be patient...");
	$SIG{'TERM'} = 'IGNORE';
	for (@running) {
		kill_gently($_->{'pid'}, 1);
	}
	unlink $lockfile if ($locked);
	exit(0);
}
# Append a job (described by key/value pairs) to the pending queue.
sub queue_job {
	my %opts = @_;
	$opts{'queued_at'} = time;
	$opts{'dont_run'} = 0;
	$opts{'intensive'} = 0 unless exists $opts{'intensive'};
	push @queue, \%opts;
}
# Start a queued job; if its command flags it as skipped, drop it again
# and count the skip.
sub run_job {
	my $job = shift;

	push @running, $job;
	$job->{'command'}->($job);
	if ($job->{'dont_run'}) {
		pop @running;
		$jobs_skipped++;
		return;
	}
}
# Human-readable "[type::project]" tag used in log messages.
sub _job_name {
	my $job = shift;
	return '[' . $job->{'type'} . '::' . $job->{'project'} . ']';
}
# Only one of those per job!
# Fork and exec $command for $job in its own process group, recording the
# child's pid and start time in the job.  On fork failure the job is marked
# finished so it gets reaped as an error.
sub exec_job_command {
	my ($job, $command, $err_only) = @_;

	my $pid;
	$job->{'finished'} = 0;
	delete $job->{'pid'};
	if (!defined($pid = fork)) {
		error(_job_name($job) ." Can't fork job: $!");
		$job->{'finished'} = 1;
		return;
	}
	if (!$pid) {
		# "Prevent" races
		select(undef, undef, undef, 0.1);

		# NOTE: these opens previously used "|| do {...}" which, due to
		# ||'s high precedence, bound to the filename and could never
		# detect failure; low-precedence "or" makes the error path live.
		open STDIN, '<', '/dev/null' or do {
			error(_job_name($job) ." Can't read from /dev/null: $!");
			exit 71; # EX_OSERR
		};
		if ($err_only) {
			open STDOUT, '>', '/dev/null' or do {
				error(_job_name($job) ." Can't write to /dev/null: $!");
				exit 71; # EX_OSERR
			};
		}
		# New process group so we can keep track of all of its children
		if (!defined(POSIX::setpgid(0, 0))) {
			error(_job_name($job) ." Can't create process group: $!");
			exit 71; # EX_OSERR
		}
		exec @$command;
		# Only reached if exec itself failed; stop perl from complaining
		exit 71; # EX_OSERR
	}
	$job->{'pid'} = $pid;
	$job->{'started_at'} = time;
}
# Mark $job as not-to-run, logging $msg (if any) unless running quietly.
sub job_skip {
	my ($job, $msg) = @_;
	$job->{'dont_run'} = 1;
	error(_job_name($job) ." Skipping job: $msg") unless $quiet || !$msg;
}
# Kill any running job that has exceeded its (possibly scaled) timeout,
# marking it finished/killed so reap_finished_jobs can process it.
sub reap_hanging_jobs {
	for (@running) {
		my $factor = $_->{'timeout_factor'} || 1;
		if (defined($_->{'started_at'}) && (time - $_->{'started_at'}) > ($kill_after * $factor)) {
			$_->{'finished'} = 1;
			my $exitcode = kill_gently($_->{'pid'}, 1);
			delete $_->{'pid'};
			$_->{'killed'} = 1;
			error(_job_name($_) ." KILLED due to timeout" .
				(($exitcode & 0x7f) == 9 ? " with SIGKILL": ""));
			push @jobs_killed, _job_name($_);
		}
	}
}
# Finalize a single job: 'finished' not yet set means it completed normally
# (run on_success); already set means it failed or was killed (run on_error).
sub reap_one_job {
	my $job = shift;
	if (!$job->{'finished'}) {
		$job->{'on_success'}->($job) if defined($job->{'on_success'});
		$job->{'finished'} = 1;
		$jobs_executed++;
	} else {
		$job->{'on_error'}->($job) if defined($job->{'on_error'});
	}
}
# Collect jobs that were killed by timeout plus any children that have
# exited, finalize each, and prune the running list.  Returns true when
# at least one job was reaped.
sub reap_finished_jobs {
	my $pid;
	my $finished_any = 0;
	# Timed-out jobs were already reaped by kill_gently and have no pid
	foreach my $victim (grep { !$_->{'pid'} && $_->{'killed'} } @running) {
		delete $victim->{'killed'};
		reap_one_job($victim);
		$finished_any = 1;
	}
	while (1) {
		$pid = waitpid(-1, WNOHANG);
		last if $pid <= 0;
		$finished_any = 1;

		my @child = grep { $_->{'pid'} && $_->{'pid'} == $pid } @running;
		if ($?) {
			# any non-zero exit status should trigger on_error
			$child[0]->{'finished'} = 1 if @child;
		}
		if (@child) {
			delete $child[0]->{'pid'};
			reap_one_job($child[0]);
		}
	}
	@running = grep { $_->{'finished'} == 0 } @running;
	$finished_any;
}
# Number of currently-running resource-hungry jobs.
sub have_intensive_jobs {
	grep { $_->{'intensive'} == 1 } @running;
}
# Timestamp prefix for log lines, e.g. "[Mon Jan  1 00:00:00 2024] ".
sub ts {
	my $now = scalar(localtime);
	return "[$now] ";
}
# Return the 1, 5 and 15 minute load averages as a three-element list, or
# undef if they cannot be determined.
# Uses lexical filehandles (the original bareword LOADAV handle was a
# process-wide global and not reentrant).
sub get_load_info {
	if ($^O eq "linux") {
		# Read /proc/loadavg on Linux
		open(my $fh, '<', '/proc/loadavg') or return undef;
		my $loadinfo = <$fh>;
		close($fh);
		return (split(/\s/, $loadinfo, 4))[0..2];
	} else {
		# Read the output of uptime everywhere else (works on Linux too)
		open(my $fh, '-|', 'uptime') or return undef;
		my $loadinfo = <$fh>;
		close($fh);
		$loadinfo =~ /load average[^0-9.]*([0-9.]+)[^0-9.]+([0-9.]+)[^0-9.]+([0-9.]+)/iso or return undef;
		return ($1, $2, $3);
	}
}
# Main scheduling loop: run queued jobs (respecting parallelism limits and
# system load triggers) until both the queue and the running set are empty.
sub run_queue {
	my $last_progress = time;
	my $last_checkload = time - 5;
	my $current_load = $load_trig;
	my $overloaded = 0;
	my $load_info = '';
	$jobs_executed = 0;
	$jobs_skipped = 0;
	@jobs_killed = ();
	if ($progress) {
		my $plural = @queue == 1 ? '' : 's';
		ferror("--- Processing %d queued job$plural", scalar(@queue));
	}
	$SIG{'INT'} = \&handle_softexit;
	$SIG{'TERM'} = \&handle_exit;
	while (@queue || @running) {
		reap_hanging_jobs();
		my $proceed_immediately = reap_finished_jobs();
		# Check current system load (at most every 5 seconds)
		if ($load_trig && (time - $last_checkload) >= 5 && defined((my @loadinfo = get_load_info())[0])) {
			# deliberately shadows the outer $current_load
			my $current_load = $loadinfo[0];
			if ($current_load > $load_trig && !$overloaded) {
				$overloaded = 1;
				error("PAUSE: system load is at $current_load > $load_trig") if $progress;
			} elsif ($current_load < $load_untrig && $overloaded) {
				$overloaded = 0;
				error("RESUME: system load is at $current_load < $load_untrig") if $progress;
			}
			if ($overloaded) {
				$load_info = ', paused (load '. $current_load .')';
			} else {
				$load_info = ', load '. $current_load;
			}
			$last_checkload = time;
		}
		# Status output (at most every 60 seconds)
		if ($progress && (time - $last_progress) >= 60) {
			ferror("STATUS: %d queued, %d running, %d finished, %d skipped, %d killed$load_info", scalar(@queue), scalar(@running), $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
			if (@running) {
				my @run_status = map {
					_job_name($_)." ". (time - $_->{'started_at'}) ."s"
				} @running;
				error("STATUS: currently running: ". join(', ', @run_status));
			}
			$last_progress = time;
		}
		# Back off if we're too busy
		if (@running >= $max_par || have_intensive_jobs() >= $max_par_intensive || !@queue || $overloaded) {
			sleep 1 unless $proceed_immediately;
			next;
		}
		# Run next
		run_job(shift(@queue)) if @queue;
	}
	if ($progress) {
		my $plural = $jobs_executed == 1 ? '' : 's';
		ferror("--- Queue processed. %d job$plural executed, %d skipped, %d killed.", $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
	}
}
# Run the queue repeatedly (every $restart_delay seconds) while holding the
# lockfile.  Returns "restart" when the lockfile contains a restart request,
# otherwise "" when $perpetual is cleared.
# FIX: the original opens used "|| die" which, due to ||'s precedence, bound
# to the filename and never detected failure; now low-precedence "or" with
# lexical filehandles (bareword LOCK was a process-wide global).
sub run_perpetually {
	if (-e $lockfile) {
		die "Lockfile '$lockfile' exists. Please make sure no other instance of jobd is running.\n";
	}
	open my $lockfh, '>', $lockfile or die "Cannot create lockfile '$lockfile': $!\n";
	print $lockfh $$;
	close $lockfh;
	$locked = 1;

	my $result = "";
	while ($perpetual) {
		# touch ctime of lockfile to prevent it from being removed by /tmp cleaning
		chmod 0640, $lockfile;
		chmod 0644, $lockfile;
		# check for restart request
		open my $reqfh, '<', $lockfile or die "Lock file '$lockfile' has disappeared!\n";
		my $request = <$reqfh>;
		close $reqfh;
		chomp $request if defined($request);
		if (defined($request) && $request eq "restart") {
			$result = $request;
			last;
		}
		queue_all();
		run_queue();
		sleep($restart_delay) if $perpetual; # Let the system breathe for a moment
	}
	unlink $lockfile;
	$locked = 0;
	return $result;
}
516 ######### Helpers {{{1
# Print a timestamped message to STDERR.
sub error($) {
	my $msg = shift;
	print STDERR ts() . $msg . "\n";
}
# sprintf-style wrapper around error().
sub ferror(@) {
	my ($fmt, @args) = @_;
	error(sprintf($fmt, @args));
}
# Log the message and terminate with exit status 1.
sub fatal($) {
	error(shift);
	exit 1;
}
529 ######### Main {{{1
# Capture re-exec state before we chdir away from the start directory
my $reexec = Girocco::ExecUtil->new;
my $realpath0 = realpath($0);
chdir "/";
close(DATA) if fileno(DATA);

# Parse options
Getopt::Long::Configure('bundling');
my $parse_res = GetOptions(
	'help|?|h' => sub {
		pod2usage(-verbose => 2, -exitval => 0, -input => $realpath0)},
	'quiet|q' => \$quiet,
	'progress|P' => \$progress,
	'kill-after|k=i' => \$kill_after,
	'max-parallel|p=i' => \$max_par,
	'max-intensive-parallel|i=i' => \$max_par_intensive,
	'load-triggers=s' => \$load_triggers,
	'restart-delay|d=i' => \$restart_delay,
	'lockfile|l=s' => \$lockfile,
	'same-pid' => \$same_pid,
	'all-once|a' => \$all_once,
	'one|o=s' => \$one,
) || pod2usage(-exitval => 2, -input => $realpath0);
fatal("Error: can only use one out of --all-once and --one")
	if ($all_once && $one);

unless ($quiet) {
	$ENV{'show_progress'} = '1';
	$progress = 1;
}

# Disable load checking entirely when no load information is available
$load_triggers = '0,0' unless defined((get_load_info())[0]);
($load_trig, $load_untrig) = split(/,/, $load_triggers);

if ($one) {
	queue_one($one);
	run_queue();
	exit;
}

if ($all_once) {
	queue_all();
	run_queue();
	exit;
}

# Perpetual mode; the bare block lets "redo" retry after a failed re-exec
{
	if (run_perpetually() eq "restart") {
		error("Restarting in response to restart request... ");
		$reexec->reexec($same_pid);
		error("Continuing after failed restart: $!");
		chdir "/";
		redo;
	}
}
585 ########## Documentation {{{1
587 __END__
589 =head1 NAME
591 jobd.pl - Perform Girocco maintenance jobs
593 =head1 SYNOPSIS
595 jobd.pl [options]
597 Options:
598 -h | --help detailed instructions
599 -q | --quiet run quietly
600 -P | --progress show occasional status updates
601 -k SECONDS | --kill-after SECONDS how long to wait before killing jobs
602 -p NUM | --max-parallel NUM how many jobs to run at the same time
603 -i NUM | --max-intensive-parallel NUM how many resource-hungry jobs to run
604 at the same time
605 --load-triggers TRIG,UNTRIG stop queueing jobs at load above
606 TRIG and resume at load below UNTRIG
607   -d SECONDS | --restart-delay SECONDS  wait for this many seconds between
608 queue runs
609 -l FILE | --lockfile FILE create a lockfile in the given
610 location
611 --same-pid keep same pid during graceful restart
612 -a | --all-once process the list only once
613 -o PRJNAME | --one PRJNAME process only one project
615 =head1 OPTIONS
617 =over 8
619 =item B<--help>
621 Print the full description of jobd.pl's options.
623 =item B<--quiet>
625 Suppress non-error messages, e.g. for use when running this task as a cronjob.
627 =item B<--progress>
629 Show information about the current status of the job queue occasionally. This
630 is automatically enabled if --quiet is not given.
632 =item B<--kill-after SECONDS>
634 Kill supervised jobs after a certain time to avoid hanging the daemon.
636 =item B<--max-parallel NUM>
638 Run no more than that many jobs at the same time. The default is the number
639 of cpus * 2. If the number of cpus cannot be determined, the default is 8.
641 =item B<--max-intensive-parallel NUM>
643 Run no more than that many resource-hungry jobs at the same time. Right now,
644 this refers to repacking jobs. The default is 1.
646 =item B<--load-triggers TRIG,UNTRIG>
648 If the first system load average (1 minute average) exceeds TRIG, don't queue
649 any more jobs until it goes below UNTRIG. This is currently only supported on
650 Linux and any other platforms that provide an uptime command with load average
651 output.
653 If both values are zero, load checks are disabled. The default is the number
654 of cpus * 1.5 for TRIG and half that for UNTRIG. If the number of cpus cannot
655 be determined, the default is 6,3.
657 =item B<--restart-delay SECONDS>
659 After processing the queue, wait this many seconds until the queue is
660 restarted. The default is 300 seconds.
662 =item B<--lockfile FILE>
664 For perpetual operation, specify the full path to a lock file to create and
665 then remove after finishing/aborting. The default is /tmp/jobd-$suffix.lock
666 where $suffix is a 6-character string uniquely determined by the name and
667 nickname of this Girocco instance. The pid of the running jobd instance will
668 be written to the lock file.
670 =item B<--same-pid>
672 When performing a graceful restart, keep the same pid rather than switching to
673 a new one.
675 =item B<--all-once>
677 Instead of perpetually processing all projects over and over again, process
678 them just once and then exit.
680 =item B<--one PRJNAME>
682 Process only the given project (given as just the project name without C<.git>
683 suffix) and then exit.
685 =back
687 =head1 DESCRIPTION
689 jobd.pl is Girocco's repositories maintenance servant; it periodically checks
690 all the repositories and updates mirrored repositories and repacks push-mode
691 repositories when needed.
693 =cut