jobd: increase timeout to 15 minutes
[girocco.git] / jobd / jobd.pl
blobc0ecc5200c72cf501c0dd24e3668c4f63031a68c
1 #!/usr/bin/perl
3 # jobd - perform Girocco maintenance jobs
5 # Run with --help for details
7 use strict;
8 use warnings;
10 use Getopt::Long;
11 use Pod::Usage;
13 use Girocco::Config;
14 use Girocco::Project;
15 use Girocco::User;
# Options
# Defaults for the command-line switches parsed further down by GetOptions().
my ($quiet, $all_once, $one);       # --quiet, --all-once, --one
my $lockfile   = "/tmp/jobd.lock";  # --lockfile
my $max_par    = 3;                 # --max-parallel
my $kill_after = 900;               # --kill-after, in seconds
25 ######### Jobs {{{1
# Job handler for 'update' jobs.
#
# $job is a job hash as built by queue_job(); its 'project' entry is the
# project name without the ".git" suffix.
#
# Non-existent projects are skipped by check_project_exists(). Projects that
# carry a ".nofetch" marker file are skipped too (presumably these are the
# push-mode, non-mirrored projects - confirm against the marker's creator).
# Every other project is handed to update.sh.
sub update_project {
    my $job = shift;
    my $p = $job->{'project'};
    check_project_exists($job) || return;
    # Skip when the marker IS present; only projects without it get fetched.
    if (-e "$Girocco::Config::reporoot/$p.git/.nofetch") {
        job_skip($job);
        return;
    }
    exec_job_command($job, ["$Girocco::Config::basedir/jobd/update.sh", $p], $quiet);
}
# Job handler for 'gc' jobs: run gc.sh for the job's project, provided the
# project still exists.
sub gc_project {
    my ($job) = @_;
    return unless check_project_exists($job);
    my $name = $job->{'project'};
    my @cmd = ("$Girocco::Config::basedir/jobd/gc.sh", $name);
    exec_job_command($job, \@cmd, $quiet);
}
# Completion callback: queue a 'gc' job for the project of the job that has
# just finished.
sub setup_gc {
    my ($job) = @_;
    my @gc_job = (
        project => $job->{'project'},
        type    => 'gc',
        command => \&gc_project,
    );
    queue_job(@gc_job);
}
# Check that the repository directory of $job's project exists.
#
# Returns 1 when it does. Otherwise prints a warning (unless --quiet), marks
# the job as skipped and returns 0 so that the caller can bail out.
sub check_project_exists {
    my $job = shift;
    my $p = $job->{'project'};
    return 1 if -d "$Girocco::Config::reporoot/$p.git";

    error("Warning: skipping non-existent project: $job->{project}")
        unless $quiet;
    # The job itself must be passed on: job_skip() records pid, start time
    # and state in the job hash, and the reapers rely on those fields.
    job_skip($job);
    return 0;
}
# Queue an 'update' job for a single project; a gc job is queued afterwards
# whether the update succeeds or fails.
sub queue_one {
    my ($project) = @_;
    my $followup = \&setup_gc;
    queue_job(
        project    => $project,
        type       => 'update',
        command    => \&update_project,
        on_success => $followup,
        on_error   => $followup,
    );
}
# Queue an update job for every project returned by
# Girocco::Project->get_full_list().
sub queue_all {
    foreach my $project (Girocco::Project->get_full_list()) {
        queue_one($project);
    }
}
81 ######### Daemon operation {{{1
# Shared daemon state.
my (@queue, @running);      # jobs waiting to start / jobs handed to run_job()
my @jobs_killed;            # names of jobs killed by the timeout in this run
my $jobs_executed;          # counter, reset at the start of run_queue()
my $perpetual = 1;          # cleared to leave the loop in run_perpetually()
my $locked = 0;             # true once the lockfile has been written
# Soft shutdown: drop everything still queued, stop the perpetual loop and
# make the next INT signal go to handle_exit().
sub handle_softexit {
    my $notice = "Waiting for outstanding jobs to finish... "
               . "^C again to exit immediately";
    error($notice);
    $perpetual = 0;
    @queue = ();
    $SIG{'INT'} = \&handle_exit;
}
# Hard shutdown: SIGKILL every job in @running, remove the lockfile if this
# process created it, and exit with status 0.
sub handle_exit {
    error("Killing outstanding jobs...");
    $SIG{$_} = 'IGNORE' for ('CHLD', 'TERM');
    foreach my $job (@running) {
        kill 'KILL', $job->{'pid'};
    }
    if ($locked) {
        unlink $lockfile;
    }
    exit(0);
}
# SIGCHLD handler: collect every child that has exited and mark the matching
# entry of @running as finished (state 2), which makes reap_finished_jobs()
# invoke its on_success callback.
sub handle_childgone {
    require POSIX;
    # Do not let the handler disturb $! / $? of the interrupted code.
    local ($!, $?);
    # Several children may be reported by a single signal, so keep collecting
    # until no further exited child is available.
    while ((my $pid = waitpid(-1, POSIX::WNOHANG())) > 0) {
        # List assignment: we want the job itself, not the number of matches.
        my ($child) = grep { defined $_->{'pid'} && $_->{'pid'} == $pid } @running;
        if ($?) {
            # XXX- we currently don't care
        }
        # Not (or no longer) one of our tracked jobs.
        next unless $child;
        # Keep the verdict of a job already flagged by reap_hanging_jobs().
        next if $child->{'finished'};
        $child->{'finished'} = 2;
        $jobs_executed++;
    }
    # Just to be safe
    $SIG{'CHLD'} = \&handle_childgone;
}

# Install the handler: it is the only place where a job is marked as having
# completed on its own; without it jobs end only through the timeout.
$SIG{'CHLD'} = \&handle_childgone;
# Append a job to @queue. Arguments are key/value pairs describing the job;
# the time of queueing is stored under 'queued_at'.
sub queue_job {
    my $job = { @_, 'queued_at' => time };
    push @queue, $job;
}
# Register $job in @running, then invoke its 'command' callback with the job
# as sole argument.
sub run_job {
    my ($job) = @_;
    push @running, $job;
    my $handler = $job->{'command'};
    $handler->($job);
}
# Build the "[type::project]" label used to identify a job in messages.
sub _job_name {
    my ($job) = @_;
    return sprintf('[%s::%s]', $job->{'type'}, $job->{'project'});
}
141 # Only one of those per job!
# Fork a child that executes @$command on behalf of $job.
#
#   $job      - job hash; receives 'pid', 'started_at' and 'finished'
#               ('finished' is 0 while the job runs and is set to 1 right
#               away when the fork fails)
#   $command  - array ref: program followed by its arguments
#   $err_only - when true, the child's STDOUT is sent to /dev/null
sub exec_job_command {
    my ($job, $command, $err_only) = @_;

    # Fill in the bookkeeping before forking, so that nothing written here
    # can overwrite a completion state recorded for a child that exits
    # immediately.
    $job->{'finished'} = 0;
    $job->{'started_at'} = time;

    my $pid = fork;
    if (!defined $pid) {
        error(_job_name($job) ." Can't fork job: $!");
        $job->{'finished'} = 1;
        return;
    }
    if (!$pid) {
        # Child process. It must never return into the daemon's own code,
        # so every path below ends in exec or exit.
        if ($err_only) {
            open(STDOUT, '>', '/dev/null') or do {
                error(_job_name($job) ." Can't write to /dev/null: $!");
                exit 1;
            };
        }
        {
            # We report an exec failure ourselves, with the job name.
            no warnings 'exec';
            # The indirect-object form never passes the command to a shell.
            exec { $command->[0] } @$command;
        }
        error(_job_name($job) ." Can't exec $command->[0]: $!");
        exit 127;
    }
    $job->{'pid'} = $pid;
    return;
}
# Skip a job by running /bin/false in its place, so it passes through the
# same child bookkeeping as a real job.
sub job_skip {
    my ($job) = @_;
    my @noop = ('/bin/false');
    exec_job_command($job, \@noop);
}
# Kill every running job that has been running for more than $kill_after
# seconds, flag it as failed (state 1) and record its name in @jobs_killed.
sub reap_hanging_jobs {
    for my $job (@running) {
        # Already finished (or already killed): nothing left to time out.
        next if $job->{'finished'};
        # A job without a child (e.g. after a failed fork) has no pid.
        # kill() with an undefined pid would signal pid 0, i.e. our own
        # process group, so such jobs must never get here.
        next unless defined $job->{'pid'} && defined $job->{'started_at'};
        next unless (time - $job->{'started_at'}) > $kill_after;

        $job->{'finished'} = 1;
        kill 'KILL', $job->{'pid'};
        print STDERR _job_name($job) ." KILLED due to timeout\n";
        push @jobs_killed, _job_name($job);
    }
}
# Run the completion callback of every job in @running that is no longer
# active (state 1 -> on_error, state 2 -> on_success), then drop all such
# jobs from @running.
sub reap_finished_jobs {
    my %callback_for = (1 => 'on_error', 2 => 'on_success');
    foreach my $job (@running) {
        my $status = $job->{'finished'};
        next if $status == 0;
        my $key = $callback_for{$status};
        next unless defined $key;
        my $callback = $job->{$key};
        $callback->($job) if defined $callback;
    }
    my @still_active = grep { $_->{'finished'} == 0 } @running;
    @running = @still_active;
}
# Process @queue until no job is queued or running any more.
#
# At most $max_par jobs run at the same time. Timed-out and finished jobs are
# reaped on every iteration; completion callbacks may queue further jobs,
# which are processed in the same run. Progress goes to STDERR unless
# --quiet was given.
sub run_queue {
    my $queue_steps = 0;
    $jobs_executed = 0;
    @jobs_killed = ();
    unless ($quiet) {
        # scalar(): printf's arguments are in list context, so a bare array
        # would be flattened into its elements instead of giving its size.
        printf STDERR "--- Processing %d queued jobs\n", scalar(@queue);
    }
    while (@queue || @running) {
        reap_hanging_jobs();
        reap_finished_jobs();
        # Back off if we're too busy
        if (@running >= $max_par) {
            sleep 10;
            $queue_steps++;
            unless ($quiet || ($queue_steps % 10)) {
                printf STDERR "STATUS: %d queued, %d running, %d finished, %d killed\n",
                    scalar(@queue), scalar(@running), $jobs_executed, scalar(@jobs_killed);
            }
            # Re-check the running jobs; the queue must not be abandoned.
            next;
        }
        # Nothing to start right now: wait for the running jobs without
        # spinning at full speed.
        unless (@queue) {
            sleep 1 if @running;
            next;
        }
        # Run next
        run_job(shift(@queue));
    }
    unless ($quiet) {
        printf STDERR "--- Queue processed. %d jobs executed, %d killed due to timeouts. Now restarting.\n",
            $jobs_executed, scalar(@jobs_killed);
    }
}
# Daemon mode: write a lockfile holding our pid, then queue and process all
# projects over and over until $perpetual is cleared. Dies when the lockfile
# already exists or cannot be written.
sub run_perpetually {
    if (-e $lockfile) {
        die "Lockfile exists. Please make sure no other instance of jobd is running.";
    }
    # 'or', not '||': the latter would bind to $lockfile and a failing open
    # would go unnoticed.
    open(my $lock, '>', $lockfile) or die "Cannot create lockfile $lockfile: $!";
    print {$lock} $$;
    close($lock) or die "Cannot write lockfile $lockfile: $!";
    $locked = 1;

    # First ^C: finish the outstanding jobs, then stop (the handler re-arms
    # INT for an immediate exit). TERM: kill the jobs and exit at once. Both
    # paths end up removing the lockfile.
    $SIG{'INT'} = \&handle_softexit;
    $SIG{'TERM'} = \&handle_exit;

    while ($perpetual) {
        queue_all();
        run_queue();
    }
    unlink $lockfile;
}
239 ######### Helpers {{{1
# Print a message, followed by a newline, to STDERR.
sub error($) {
    my ($message) = @_;
    print STDERR $message."\n";
}
# Report a message through error() and terminate with exit status 1.
sub fatal($) {
    my ($message) = @_;
    error($message);
    exit 1;
}
249 ######### Main {{{1
# Parse options
Getopt::Long::Configure('bundling', 'auto_help');
my %option_spec = (
    'quiet|q'          => \$quiet,
    'kill-after|k=i'   => \$kill_after,
    'max-parallel|p=i' => \$max_par,
    'lockfile|l=s'     => \$lockfile,
    'all-once|a'       => \$all_once,
    'one|o=s'          => \$one,
);
my $parse_res = GetOptions(%option_spec) || pod2usage(2);
if ($all_once && $one) {
    fatal("Error: can only use one out of --all-once and --one");
}

# Exported to the environment of the job scripts when not running quietly.
$ENV{'show_progress'} = '1' unless $quiet;

# One-shot modes: fill the queue once, process it and leave.
if ($one || $all_once) {
    if ($one) {
        queue_one($one);
    }
    else {
        queue_all();
    }
    run_queue();
    exit;
}

# Default: keep going until told to stop.
run_perpetually();
282 ########## Documentation {{{1
284 __END__
286 =head1 NAME
288 jobd - Perform Girocco maintenance jobs
290 =head1 SYNOPSIS
292 jobd [options]
294 Options:
295 -h | --help detailed instructions
296 -q | --quiet run quietly
297 -k SECONDS | --kill-after=SECONDS how long to wait before killing jobs
298 -p NUM | --max-parallel=NUM how many jobs to run at the same time
299 -l FILE | --lockfile=FILE create a lockfile in the given location
300 -a | --all-once process the list only once
301 -o PRJNAME | --one=PRJNAME process only one project
303 =head1 OPTIONS
305 =over 8
307 =item B<--help>
309 Print the full description of jobd's options.
311 =item B<--quiet>
313 Suppress non-error messages, e.g. for use when running this task as a cronjob.
315 =item B<--kill-after=SECONDS>
317 Kill supervised jobs after a certain time to avoid hanging the daemon.
319 =item B<--max-parallel=NUM>
321 Run no more than that many jobs at the same time.
323 =item B<--lockfile=FILE>
325 For perpetual operation, create a lockfile in that place and clean it up after
326 finishing/aborting.
328 =item B<--all-once>
330 Instead of perpetually processing all projects over and over again, process
331 them just once and then exit.
333 =item B<--one=PRJNAME>
335 Process only the given project (given as just the project name without C<.git>
336 suffix) and then exit.
338 =back
340 =head1 DESCRIPTION
342 jobd is Girocco's repositories maintenance servant; it periodically checks all
343 the repositories and updates mirrored repositories and repacks push-mode
344 repositories when needed.
346 =cut