From d464f6523e7141f7c9ae0d724579ba3f1e5da05c Mon Sep 17 00:00:00 2001 From: =?utf8?q?Jan=20Kr=C3=BCger?= Date: Wed, 24 Nov 2010 18:42:39 +0100 Subject: [PATCH] jobd: add load checks and load-based pausing with --load-triggers MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Introduce new option to enable load checks (currently on Linux only): starting new jobs will be suspended if system load goes above a certain threshold, and it will be resumed once system load drops below a separate threshold. This allows the system to recover from high loads more easily. Signed-off-by: Jan Krüger --- jobd/jobd.pl | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/jobd/jobd.pl b/jobd/jobd.pl index 70c2e88..f41b7b2 100755 --- a/jobd/jobd.pl +++ b/jobd/jobd.pl @@ -21,11 +21,14 @@ my $progress; my $kill_after = 900; my $max_par = 20; my $max_par_intensive = 1; +my @load_triggers = (10,2); my $lockfile = "/tmp/jobd.lock"; my $restart_delay = 60; my $all_once; my $one; +my ($load_trig, $load_untrig); + ######### Jobs {{{1 sub update_project { @@ -250,6 +253,10 @@ sub ts { sub run_queue { my $last_progress = time; + my $last_checkload = time - 5; + my $current_load = $load_trig; + my $overloaded = 0; + my $load_info = ''; $jobs_executed = 0; $jobs_skipped = 0; @jobs_killed = (); @@ -261,9 +268,29 @@ sub run_queue { while (@queue || @running) { reap_hanging_jobs(); my $proceed_immediately = reap_finished_jobs(); + # Check current system load + if ($load_trig && (time - $last_checkload) >= 5 && open(LOADAV, '<', '/proc/loadavg')) { + my $loadinfo = <LOADAV>; + close LOADAV; + my @loadinfo = split(/\s/, $loadinfo); + my $current_load = $loadinfo[0]; + if ($current_load > $load_trig && !$overloaded) { + $overloaded = 1; + error("PAUSE: system load is at $current_load > $load_trig") if $progress; + } elsif ($current_load < $load_untrig && $overloaded) { + $overloaded = 0; + error("RESUME: system
load is at $current_load < $load_untrig") if $progress; + } + if ($overloaded) { + $load_info = ', paused (load '. $current_load .')'; + } else { + $load_info = ', load '. $current_load; + } + $last_checkload = time; + } # Status output if ($progress && (time - $last_progress) >= 60) { - ferror("STATUS: %d queued, %d running, %d finished, %d skipped, %d killed", scalar(@queue), scalar(@running), $jobs_executed, $jobs_skipped, scalar(@jobs_killed)); + ferror("STATUS: %d queued, %d running, %d finished, %d skipped, %d killed$load_info", scalar(@queue), scalar(@running), $jobs_executed, $jobs_skipped, scalar(@jobs_killed)); if (@running) { my @run_status; for (@running) { @@ -274,7 +301,7 @@ sub run_queue { $last_progress = time; } # Back off if we're too busy - if (@running >= $max_par || have_intensive_jobs() >= $max_par_intensive || !@queue) { + if (@running >= $max_par || have_intensive_jobs() >= $max_par_intensive || !@queue || $overloaded) { sleep 1 unless $proceed_immediately; next; } @@ -326,6 +353,7 @@ my $parse_res = GetOptions( 'kill-after|k=i' => \$kill_after, 'max-parallel|p=i' => \$max_par, 'max-intensive-parallel|i=i' => \$max_par_intensive, + 'load-triggers=f{2}' => \@load_triggers, 'restart-delay|d=i' => \$restart_delay, 'lockfile|l=s' => \$lockfile, 'all-once|a' => \$all_once, @@ -339,6 +367,9 @@ unless ($quiet) { $progress = 1; } +@load_triggers = (0, 0) if (!-f '/proc/loadavg'); +($load_trig, $load_untrig) = @load_triggers; + if ($one) { queue_one($one); run_queue(); @@ -373,6 +404,8 @@ jobd [options] -p NUM | --max-parallel NUM how many jobs to run at the same time -i NUM | --max-intensive-parallel NUM how many resource-hungry jobs to run at the same time + --load-triggers TRIG UNTRIG stop queueing jobs at load above + TRIG and resume at load below UNTRIG -d NUM | --restart-delay SECONDS wait for this many seconds between queue runs -l FILE | --lockfile FILE create a lockfile in the given @@ -410,6 +443,15 @@ Run no more than that many jobs at the 
same time. Run no more than that many resource-hungry jobs at the same time. Right now, this refers to repacking jobs. +=item B<--load-triggers TRIG UNTRIG> + +If the first system load average (1 minute average) exceeds TRIG, don't queue +any more jobs until it goes below UNTRIG. This is currently only supported on +Linux. + +If both values are zero, load checks are disabled. Note that this is not the +default. + =item B<--restart-delay NUM> After processing the queue, wait this many seconds until the queue is -- 2.11.4.GIT