From f27b83d2a8180e9124251355864264f27b0cd276 Mon Sep 17 00:00:00 2001
From: Tom Werner <tom@tampa.desk.hq.powerset.com>
Date: Wed, 5 Sep 2007 17:18:39 -0700
Subject: [PATCH] fix potential problem with event registration failures

---
 lib/god/conditions/process_exits.rb |  4 +---
 lib/god/conditions/tries.rb         | 18 +++++++--------
 lib/god/errors.rb                   |  3 +++
 lib/god/hub.rb                      | 11 ++++++++-
 site/index.html                     | 23 ++++++++++++++++++-
 test/test_conditions_tries.rb       | 46 +++++++++++++++++++++++++++++++++++++
 6 files changed, 91 insertions(+), 14 deletions(-)
 create mode 100644 test/test_conditions_tries.rb
diff --git a/lib/god/conditions/process_exits.rb b/lib/god/conditions/process_exits.rb
index 591e909..938fb42 100644
--- a/lib/god/conditions/process_exits.rb
+++ b/lib/god/conditions/process_exits.rb
@@ -16,9 +16,7 @@ module God
             Hub.trigger(self)
           end
         rescue StandardError
-          # Rapid consecutive kills of a process using ProcessExits can cause
-          # the registration to fail. Capturing the exception and having the
-          # start->up transition recover is the best current solution - Tom
+          raise EventRegistrationFailedError.new
         end
       end
       
diff --git a/lib/god/conditions/tries.rb b/lib/god/conditions/tries.rb
index cd35e45..176224a 100644
--- a/lib/god/conditions/tries.rb
+++ b/lib/god/conditions/tries.rb
@@ -2,14 +2,10 @@ module God
   module Conditions
     
     class Tries < PollCondition
-      attr_accessor :times
+      attr_accessor :times, :within
       
       def prepare
-        if self.times.kind_of?(Integer)
-          self.times = [self.times, self.times]
-        end
-        
-        @timeline = Timeline.new(self.times[1])
+        @timeline = Timeline.new(self.times)
       end
     
       def valid?
@@ -19,9 +15,13 @@ module God
       end
     
       def test
-        @timeline.push(true)
-        if @timeline.select { |x| x }.size >= self.times.first
-          @timeline.clear
+        @timeline << Time.now
+        
+        concensus = (@timeline.size == self.times)
+        duration = within.nil? || (@timeline.last - @timeline.first) < self.within
+        
+        if concensus && duration
+          @timeline.clear if within.nil?
           return true
         else
           return false
diff --git a/lib/god/errors.rb b/lib/god/errors.rb
index b4a11f0..85263c4 100644
--- a/lib/god/errors.rb
+++ b/lib/god/errors.rb
@@ -15,4 +15,7 @@ module God
   class InvalidCommandError < StandardError
   end
   
+  class EventRegistrationFailedError < StandardError
+  end
+  
 end
\ No newline at end of file
diff --git a/lib/god/hub.rb b/lib/god/hub.rb
index 278d8dc..3a0a9d5 100644
--- a/lib/god/hub.rb
+++ b/lib/god/hub.rb
@@ -80,7 +80,16 @@ module God
             # transition or reschedule
             if dest
               # transition
-              watch.move(dest)
+              begin
+                watch.move(dest)
+              rescue EventRegistrationFailedError
+                msg = watch.name + ' Event registration failed, moving back to previous state'
+                Syslog.debug(msg)
+                LOG.log(watch, :info, msg)
+                
+                dest = watch.state
+                retry
+              end
             else
               # reschedule
               Timer.get.schedule(condition)
diff --git a/site/index.html b/site/index.html
index c63b4b2..a4f0bda 100644
--- a/site/index.html
+++ b/site/index.html
@@ -326,7 +326,7 @@ end</code></pre>
     end</code></pre>
       
 <p>Watches contain conditions grouped by the action to execute should they return <code>true</code>. I start with a <code>start_if</code> block that contains a single condition. Conditions are specified by calling <code>condition</code> with an identifier, in this case 
-<code>:process_not_running</code>. Each condition can specify a poll interval that will override the default watch interval. In this case, I want to check that the process is still running every 5 seconds instead of the 30 second interval that other conditions will inherit. The ability to set condition specific poll intervals makes it possible to run costly tests less often then cheap tests.</p>
+<code>:process_running</code>. Each condition can specify a poll interval that will override the default watch interval. In this case, I want to check that the process is still running every 5 seconds instead of the 30 second interval that other conditions will inherit. The ability to set condition specific poll intervals makes it possible to run costly tests less often then cheap tests.</p>
 
 <pre><code class="ruby">    w.restart_if do |restart|
       restart.condition(:memory_usage) do |c|
@@ -450,6 +450,12 @@ God.watch do |w|
     on.condition(:process_running) do |c|
       c.running = true
     end
+    
+    # failsafe
+    on.condition(:tries) do |c|
+      c.times = 3
+      c.transition = :start
+    end
   end
 
   # start if process is not running
@@ -496,10 +502,25 @@ end
     on.condition(:process_running) do |c|
       c.running = true
     end
+    
+    ...
   end</code></pre>
 
 <p>If god has determined that my process isn't running, the Watch will be put into the <code>start</code> state. Upon entering this state, the <code>start</code> command that I specified on the Watch will be called. In addition, the above transition specifies a condition that should be enabled when in either the <code>start</code> or <code>restart</code> states. The condition is another <code>process_running</code>, however this time I'm only interested in moving to another state once it returns <code>true</code>. A <code>true</code> return from this condition means that the process is running and it's ok to transition to the <code>up</code> state (second argument to <code>transition</code>).</p>
 
+<pre><code class="ruby">  # determine when process has finished starting
+  w.transition([:start, :restart], :up) do |on|
+    ...
+    
+    # failsafe
+    on.condition(:tries) do |c|
+      c.times = 3
+      c.transition = :start
+    end
+  end</code></pre>
+
+<p>The other half of this transition uses the <code>tries</code> condition to ensure that god doesn't get stuck in this state. It's possible that the process could go down while the transition is being made, in which case god would end up polling forever to see if the process is up. Here I've specified that if this condition is called three times, god should override the normal transition destination and move to the <code>start</code> state instead. If you specify a <code>transition</code> attribute on any condition, that state will be transferred to instead of the normal transfer destination.</p>
+
 <pre><code class="ruby">  # start if process is not running
   w.transition(:up, :start) do |on|
     on.condition(:process_exits)
diff --git a/test/test_conditions_tries.rb b/test/test_conditions_tries.rb
new file mode 100644
index 0000000..54f10d7
--- /dev/null
+++ b/test/test_conditions_tries.rb
@@ -0,0 +1,46 @@
+require File.dirname(__FILE__) + '/helper'
+
+class TestConditionsTries < Test::Unit::TestCase
+  def setup
+    @c = Conditions::Tries.new
+    @c.times = 3
+    @c.prepare
+  end
+  
+  def test_prepare_should_create_timeline
+    assert 3, @c.instance_variable_get(:@timeline).instance_variable_get(:@max_size)
+  end
+  
+  def test_test_should_return_true_if_called_three_times_within_one_second
+    assert !@c.test
+    assert !@c.test
+    assert @c.test
+  end
+  
+  def test_test_should_return_false_on_fourth_call_if_called_three_times_within_one_second
+    3.times { @c.test }
+    assert !@c.test
+  end
+end
+
+class TestConditionsTriesWithin < Test::Unit::TestCase
+  def setup
+    @c = Conditions::Tries.new
+    @c.times = 3
+    @c.within = 1.seconds
+    @c.prepare
+  end
+  
+  def test_test_should_return_true_if_called_three_times_within_one_second
+    assert !@c.test
+    assert !@c.test
+    assert @c.test
+  end
+  
+  def test_test_should_return_false_if_called_three_times_within_two_seconds
+    assert !@c.test
+    assert !@c.test
+    assert sleep(1)
+    assert !@c.test
+  end
+end
\ No newline at end of file
-- 
2.11.4.GIT