Parcourir la source

Merge branch 'trac1342'

Jelte Jansen il y a 13 ans
Parent
commit
13f108b25f

+ 21 - 84
src/bin/bind10/bind10_src.py.in

@@ -92,51 +92,6 @@ VERSION = "bind10 20110223 (BIND 10 @PACKAGE_VERSION@)"
 # This is for boot_time of Boss
 # This is for boot_time of Boss
 _BASETIME = time.gmtime()
 _BASETIME = time.gmtime()
 
 
-class RestartSchedule:
-    """
-Keeps state when restarting something (in this case, a process).
-
-When a process dies unexpectedly, we need to restart it. However, if 
-it fails to restart for some reason, then we should not simply keep
-restarting it at high speed.
-
-A more sophisticated algorithm can be developed, but for now we choose
-a simple set of rules:
-
-  * If a process was been running for >=10 seconds, we restart it
-    right away.
-  * If a process was running for <10 seconds, we wait until 10 seconds
-    after it was started.
-
-To avoid programs getting into lockstep, we use a normal distribution
-to avoid being restarted at exactly 10 seconds."""
-
-    def __init__(self, restart_frequency=10.0):
-        self.restart_frequency = restart_frequency
-        self.run_start_time = None
-        self.run_stop_time = None
-        self.restart_time = None
-    
-    def set_run_start_time(self, when=None):
-        if when is None:
-            when = time.time()
-        self.run_start_time = when
-        sigma = self.restart_frequency * 0.05
-        self.restart_time = when + random.normalvariate(self.restart_frequency, 
-                                                        sigma)
-
-    def set_run_stop_time(self, when=None):
-        """We don't actually do anything with stop time now, but it 
-        might be useful for future algorithms."""
-        if when is None:
-            when = time.time()
-        self.run_stop_time = when
-
-    def get_restart_time(self, when=None):
-        if when is None:
-            when = time.time()
-        return max(when, self.restart_time)
-
 class ProcessInfoError(Exception): pass
 class ProcessInfoError(Exception): pass
 
 
 class ProcessInfo:
 class ProcessInfo:
@@ -151,7 +106,6 @@ class ProcessInfo:
         self.env = env
         self.env = env
         self.dev_null_stdout = dev_null_stdout
         self.dev_null_stdout = dev_null_stdout
         self.dev_null_stderr = dev_null_stderr
         self.dev_null_stderr = dev_null_stderr
-        self.restart_schedule = RestartSchedule()
         self.uid = uid
         self.uid = uid
         self.username = username
         self.username = username
         self.process = None
         self.process = None
@@ -200,7 +154,6 @@ class ProcessInfo:
                                         env=spawn_env,
                                         env=spawn_env,
                                         preexec_fn=self._preexec_work)
                                         preexec_fn=self._preexec_work)
         self.pid = self.process.pid
         self.pid = self.process.pid
-        self.restart_schedule.set_run_start_time()
 
 
     # spawn() and respawn() are the same for now, but in the future they
     # spawn() and respawn() are the same for now, but in the future they
     # may have different functionality
     # may have different functionality
@@ -240,8 +193,6 @@ class BoB:
         self.cc_session = None
         self.cc_session = None
         self.ccs = None
         self.ccs = None
         self.curproc = None
         self.curproc = None
-        # XXX: Not used now, waits for reintroduction of restarts.
-        self.dead_processes = {}
         self.msgq_socket_file = msgq_socket_file
         self.msgq_socket_file = msgq_socket_file
         self.nocache = nocache
         self.nocache = nocache
         self.component_config = {}
         self.component_config = {}
@@ -250,6 +201,9 @@ class BoB:
         # inapropriate. But as the code isn't probably completely ready
         # inapropriate. But as the code isn't probably completely ready
         # for it, we leave it at components for now.
         # for it, we leave it at components for now.
         self.components = {}
         self.components = {}
+        # Simply list of components that died and need to wait for a
+        # restart. Components manage their own restart schedule now
+        self.components_to_restart = []
         self.runnable = False
         self.runnable = False
         self.uid = setuid
         self.uid = setuid
         self.username = username
         self.username = username
@@ -821,7 +775,11 @@ class BoB:
                     # Tell it it failed. But only if it matters (we are
                     # Tell it it failed. But only if it matters (we are
                     # not shutting down and the component considers itself
                     # not shutting down and the component considers itself
                     # to be running.
                     # to be running.
-                    component.failed(exit_status);
+                    component_restarted = component.failed(exit_status);
+                    # if the process wants to be restarted, but not just yet,
+                    # it returns False
+                    if not component_restarted:
+                        self.components_to_restart.append(component)
             else:
             else:
                 logger.info(BIND10_UNKNOWN_CHILD_PROCESS_ENDED, pid)
                 logger.info(BIND10_UNKNOWN_CHILD_PROCESS_ENDED, pid)
 
 
@@ -837,39 +795,22 @@ class BoB:
             timeout value.
             timeout value.
 
 
         """
         """
-        # TODO: This is an artefact of previous way of handling processes. The
-        # restart queue is currently empty at all times, so this returns None
-        # every time it is called (thought is a relict that is obviously wrong,
-        # it is called and it doesn't hurt).
-        #
-        # It is preserved for archeological reasons for the time when we return
-        # the delayed restarts, most of it might be useful then (or, if it is
-        # found useless, removed).
-        next_restart = None
-        # if we're shutting down, then don't restart
         if not self.runnable:
         if not self.runnable:
             return 0
             return 0
-        # otherwise look through each dead process and try to restart
-        still_dead = {}
+        still_dead = []
+        # keep track of the first time we need to check this queue again,
+        # if at all
+        next_restart_time = None
         now = time.time()
         now = time.time()
-        for proc_info in self.dead_processes.values():
-            restart_time = proc_info.restart_schedule.get_restart_time(now)
-            if restart_time > now:
-                if (next_restart is None) or (next_restart > restart_time):
-                    next_restart = restart_time
-                still_dead[proc_info.pid] = proc_info
-            else:
-                logger.info(BIND10_RESURRECTING_PROCESS, proc_info.name)
-                try:
-                    proc_info.respawn()
-                    self.components[proc_info.pid] = proc_info
-                    logger.info(BIND10_RESURRECTED_PROCESS, proc_info.name, proc_info.pid)
-                except:
-                    still_dead[proc_info.pid] = proc_info
-        # remember any processes that refuse to be resurrected
-        self.dead_processes = still_dead
-        # return the time when the next process is ready to be restarted
-        return next_restart
+        for component in self.components_to_restart:
+            if not component.restart(now):
+                still_dead.append(component)
+                if next_restart_time is None or\
+                   next_restart_time > component.get_restart_time():
+                    next_restart_time = component.get_restart_time()
+        self.components_to_restart = still_dead
+
+        return next_restart_time
 
 
 # global variables, needed for signal handlers
 # global variables, needed for signal handlers
 options = None
 options = None
@@ -1052,10 +993,6 @@ def main():
     while boss_of_bind.runnable:
     while boss_of_bind.runnable:
         # clean up any processes that exited
         # clean up any processes that exited
         boss_of_bind.reap_children()
         boss_of_bind.reap_children()
-        # XXX: As we don't put anything into the processes to be restarted,
-        # this is really a complicated NOP. But we will try to reintroduce
-        # delayed restarts, so it stays here for now, until we find out if
-        # it's useful.
         next_restart = boss_of_bind.restart_processes()
         next_restart = boss_of_bind.restart_processes()
         if next_restart is None:
         if next_restart is None:
             wait_time = None
             wait_time = None

+ 0 - 2
src/bin/bind10/tests/bind10_test.py.in

@@ -105,7 +105,6 @@ class TestBoB(unittest.TestCase):
         self.assertEqual(bob.cc_session, None)
         self.assertEqual(bob.cc_session, None)
         self.assertEqual(bob.ccs, None)
         self.assertEqual(bob.ccs, None)
         self.assertEqual(bob.components, {})
         self.assertEqual(bob.components, {})
-        self.assertEqual(bob.dead_processes, {})
         self.assertEqual(bob.runnable, False)
         self.assertEqual(bob.runnable, False)
         self.assertEqual(bob.uid, None)
         self.assertEqual(bob.uid, None)
         self.assertEqual(bob.username, None)
         self.assertEqual(bob.username, None)
@@ -118,7 +117,6 @@ class TestBoB(unittest.TestCase):
         self.assertEqual(bob.cc_session, None)
         self.assertEqual(bob.cc_session, None)
         self.assertEqual(bob.ccs, None)
         self.assertEqual(bob.ccs, None)
         self.assertEqual(bob.components, {})
         self.assertEqual(bob.components, {})
-        self.assertEqual(bob.dead_processes, {})
         self.assertEqual(bob.runnable, False)
         self.assertEqual(bob.runnable, False)
         self.assertEqual(bob.uid, None)
         self.assertEqual(bob.uid, None)
         self.assertEqual(bob.username, None)
         self.assertEqual(bob.username, None)

+ 53 - 3
src/lib/python/isc/bind10/component.py

@@ -39,6 +39,7 @@ START_CMD = 'start'
 STOP_CMD = 'stop'
 STOP_CMD = 'stop'
 
 
 STARTED_OK_TIME = 10
 STARTED_OK_TIME = 10
+COMPONENT_RESTART_DELAY = 10
 
 
 STATE_DEAD = 'dead'
 STATE_DEAD = 'dead'
 STATE_STOPPED = 'stopped'
 STATE_STOPPED = 'stopped'
@@ -99,11 +100,18 @@ class BaseComponent:
             but it is vital part of the service (like auth server). If
             but it is vital part of the service (like auth server). If
             it fails to start or crashes in less than 10s after the first
             it fails to start or crashes in less than 10s after the first
             startup, the system is brought down. If it crashes later on,
             startup, the system is brought down. If it crashes later on,
-            it is restarted.
+            it is restarted (see below).
           * 'dispensable' means the component should be running, but if it
           * 'dispensable' means the component should be running, but if it
             doesn't start or crashes for some reason, the system simply tries
             doesn't start or crashes for some reason, the system simply tries
             to restart it and keeps running.
             to restart it and keeps running.
 
 
+        For components that are restarted, the restarts are not always
+        immediate; if the component has run for more than
+        COMPONENT_RESTART_DELAY (10) seconds, they are restarted right
+        away. If the component has not run that long, the system waits
+        until that time has passed (since the last start) until the
+        component is restarted.
+
         Note that the __init__ method of child class should have these
         Note that the __init__ method of child class should have these
         parameters:
         parameters:
 
 
@@ -134,6 +142,7 @@ class BaseComponent:
         self.__state = STATE_STOPPED
         self.__state = STATE_STOPPED
         self._kind = kind
         self._kind = kind
         self._boss = boss
         self._boss = boss
+        self._original_start_time = None
 
 
     def start(self):
     def start(self):
         """
         """
@@ -149,6 +158,9 @@ class BaseComponent:
         logger.info(BIND10_COMPONENT_START, self.name())
         logger.info(BIND10_COMPONENT_START, self.name())
         self.__state = STATE_RUNNING
         self.__state = STATE_RUNNING
         self.__start_time = time.time()
         self.__start_time = time.time()
+        if self._original_start_time is None:
+            self._original_start_time = self.__start_time
+        self._restart_time = None
         try:
         try:
             self._start_internal()
             self._start_internal()
         except Exception as e:
         except Exception as e:
@@ -188,6 +200,11 @@ class BaseComponent:
         The exit code is used for logging. It might be None.
         The exit code is used for logging. It might be None.
 
 
         It calls _failed_internal internally.
         It calls _failed_internal internally.
+
+        Returns True if the process was immediately restarted, returns
+                False is the process was not restarted, either because
+                it is considered a core or needed component, or because
+                the component is to be restarted later.
         """
         """
         logger.error(BIND10_COMPONENT_FAILED, self.name(), self.pid(),
         logger.error(BIND10_COMPONENT_FAILED, self.name(), self.pid(),
                      exit_code if exit_code is not None else "unknown")
                      exit_code if exit_code is not None else "unknown")
@@ -199,14 +216,47 @@ class BaseComponent:
         # (including it stopped really soon)
         # (including it stopped really soon)
         if self._kind == 'core' or \
         if self._kind == 'core' or \
             (self._kind == 'needed' and time.time() - STARTED_OK_TIME <
             (self._kind == 'needed' and time.time() - STARTED_OK_TIME <
-             self.__start_time):
+             self._original_start_time):
             self.__state = STATE_DEAD
             self.__state = STATE_DEAD
             logger.fatal(BIND10_COMPONENT_UNSATISFIED, self.name())
             logger.fatal(BIND10_COMPONENT_UNSATISFIED, self.name())
             self._boss.component_shutdown(1)
             self._boss.component_shutdown(1)
+            return False
         # This means we want to restart
         # This means we want to restart
         else:
         else:
-            logger.warn(BIND10_COMPONENT_RESTART, self.name())
+            # if the component was only running for a short time, don't
+            # restart right away, but set a time it wants to restarted,
+            # and return that it wants to be restarted later
+            self.set_restart_time()
+            return self.restart()
+
+    def set_restart_time(self):
+        """Calculates and sets the time this component should be restarted.
+           Currently, it uses a very basic algorithm; start time +
+           RESTART_DELAY (10 seconds). This algorithm may be improved upon
+           in the future.
+        """
+        self._restart_at = self.__start_time + COMPONENT_RESTART_DELAY
+
+    def get_restart_time(self):
+        """Returns the time at which this component should be restarted."""
+        return self._restart_at
+
+    def restart(self, now = None):
+        """Restarts the component if it has a restart_time and if the value
+           of the restart_time is smaller than 'now'.
+
+           If the parameter 'now' is given, its value will be used instead
+           of calling time.time().
+
+           Returns True if the component is restarted, False if not."""
+        if now is None:
+            now = time.time()
+        if self.get_restart_time() is not None and\
+           self.get_restart_time() < now:
             self.start()
             self.start()
+            return True
+        else:
+            return False
 
 
     def running(self):
     def running(self):
         """
         """

+ 100 - 19
src/lib/python/isc/bind10/tests/component_test.py

@@ -221,11 +221,6 @@ class ComponentTests(BossUtils, unittest.TestCase):
         """
         """
         Check the component restarted successfully.
         Check the component restarted successfully.
 
 
-        Currently, it is implemented as starting it again right away. This will
-        change, it will register itself into the restart schedule in boss. But
-        as the integration with boss is not clear yet, we don't know how
-        exactly that will happen.
-
         Reset the self.__start_called to False before calling the function when
         Reset the self.__start_called to False before calling the function when
         the component should fail.
         the component should fail.
         """
         """
@@ -237,6 +232,16 @@ class ComponentTests(BossUtils, unittest.TestCase):
         # Check it can't be started again
         # Check it can't be started again
         self.assertRaises(ValueError, component.start)
         self.assertRaises(ValueError, component.start)
 
 
+    def __check_not_restarted(self, component):
+        """
+        Check the component has not (yet) restarted successfully.
+        """
+        self.assertFalse(self._shutdown)
+        self.assertTrue(self.__start_called)
+        self.assertFalse(self.__stop_called)
+        self.assertTrue(self.__failed_called)
+        self.assertFalse(component.running())
+
     def __do_start_stop(self, kind):
     def __do_start_stop(self, kind):
         """
         """
         This is a body of a test. It creates a component of given kind,
         This is a body of a test. It creates a component of given kind,
@@ -296,7 +301,9 @@ class ComponentTests(BossUtils, unittest.TestCase):
         component.start()
         component.start()
         self.__check_started(component)
         self.__check_started(component)
         # Pretend the component died
         # Pretend the component died
-        component.failed(1)
+        restarted = component.failed(1)
+        # Since it is a core component, it should not be restarted
+        self.assertFalse(restarted)
         # It should bring down the whole server
         # It should bring down the whole server
         self.__check_dead(component)
         self.__check_dead(component)
 
 
@@ -312,7 +319,9 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.__check_started(component)
         self.__check_started(component)
         self._timeskip()
         self._timeskip()
         # Pretend the component died some time later
         # Pretend the component died some time later
-        component.failed(1)
+        restarted = component.failed(1)
+        # Should not be restarted
+        self.assertFalse(restarted)
         # Check the component is still dead
         # Check the component is still dead
         self.__check_dead(component)
         self.__check_dead(component)
 
 
@@ -328,7 +337,9 @@ class ComponentTests(BossUtils, unittest.TestCase):
         component.start()
         component.start()
         self.__check_started(component)
         self.__check_started(component)
         # Make it fail right away.
         # Make it fail right away.
-        component.failed(1)
+        restarted = component.failed(1)
+        # Should not have restarted
+        self.assertFalse(restarted)
         self.__check_dead(component)
         self.__check_dead(component)
 
 
     def test_start_fail_needed_later(self):
     def test_start_fail_needed_later(self):
@@ -344,37 +355,65 @@ class ComponentTests(BossUtils, unittest.TestCase):
         # Make it fail later on
         # Make it fail later on
         self.__start_called = False
         self.__start_called = False
         self._timeskip()
         self._timeskip()
-        component.failed(1)
+        restarted = component.failed(1)
+        # Should have restarted
+        self.assertTrue(restarted)
         self.__check_restarted(component)
         self.__check_restarted(component)
 
 
     def test_start_fail_dispensable(self):
     def test_start_fail_dispensable(self):
         """
         """
-        Start and then fail a dispensable component. Should just get restarted.
+        Start and then fail a dispensable component. Should not get restarted.
         """
         """
         # Just ordinary startup
         # Just ordinary startup
-        component = self.__create_component('needed')
+        component = self.__create_component('dispensable')
         self.__check_startup(component)
         self.__check_startup(component)
         component.start()
         component.start()
         self.__check_started(component)
         self.__check_started(component)
         # Make it fail right away
         # Make it fail right away
-        self.__start_called = False
-        component.failed(1)
-        self.__check_restarted(component)
+        restarted = component.failed(1)
+        # Should signal that it did not restart
+        self.assertFalse(restarted)
+        self.__check_not_restarted(component)
 
 
-    def test_start_fail_dispensable(self):
+    def test_start_fail_dispensable_later(self):
         """
         """
         Start and then later on fail a dispensable component. Should just get
         Start and then later on fail a dispensable component. Should just get
         restarted.
         restarted.
         """
         """
         # Just ordinary startup
         # Just ordinary startup
-        component = self.__create_component('needed')
+        component = self.__create_component('dispensable')
         self.__check_startup(component)
         self.__check_startup(component)
         component.start()
         component.start()
         self.__check_started(component)
         self.__check_started(component)
         # Make it fail later on
         # Make it fail later on
-        self.__start_called = False
         self._timeskip()
         self._timeskip()
-        component.failed(1)
+        restarted = component.failed(1)
+        # should signal that it restarted
+        self.assertTrue(restarted)
+        # and check if it really did
+        self.__check_restarted(component)
+
+    def test_start_fail_dispensable_restart_later(self):
+        """
+        Start and then fail a dispensable component, wait a bit and try to
+        restart. Should get restarted after the wait.
+        """
+        # Just ordinary startup
+        component = self.__create_component('dispensable')
+        self.__check_startup(component)
+        component.start()
+        self.__check_started(component)
+        # Make it fail immediately
+        restarted = component.failed(1)
+        # should signal that it did not restart
+        self.assertFalse(restarted)
+        self.__check_not_restarted(component)
+        self._timeskip()
+        # try to restart again
+        restarted = component.restart()
+        # should signal that it restarted
+        self.assertTrue(restarted)
+        # and check if it really did
         self.__check_restarted(component)
         self.__check_restarted(component)
 
 
     def test_fail_core(self):
     def test_fail_core(self):
@@ -402,14 +441,56 @@ class ComponentTests(BossUtils, unittest.TestCase):
     def test_fail_dispensable(self):
     def test_fail_dispensable(self):
         """
         """
         Failure to start a dispensable component. The exception should get
         Failure to start a dispensable component. The exception should get
-        through, but it should be restarted.
+        through, but it should be restarted after a time skip.
         """
         """
         component = self.__create_component('dispensable')
         component = self.__create_component('dispensable')
         self.__check_startup(component)
         self.__check_startup(component)
         component._start_internal = self.__fail_to_start
         component._start_internal = self.__fail_to_start
         self.assertRaises(TestError, component.start)
         self.assertRaises(TestError, component.start)
+        # tell it to see if it must restart
+        restarted = component.restart()
+        # should not have restarted yet
+        self.assertFalse(restarted)
+        self.__check_not_restarted(component)
+        self._timeskip()
+        # tell it to see if it must restart and do so, with our vision of time
+        restarted = component.restart()
+        # should have restarted now
+        self.assertTrue(restarted)
+        self.__check_restarted(component)
+
+    def test_component_start_time(self):
+        """
+        Check that original start time is set initially, and remains the same
+        after a restart, while the internal __start_time does change
+        """
+        # Just ordinary startup
+        component = self.__create_component('dispensable')
+        self.__check_startup(component)
+        self.assertIsNone(component._original_start_time)
+        component.start()
+        self.__check_started(component)
+
+        self.assertIsNotNone(component._original_start_time)
+        self.assertIsNotNone(component._BaseComponent__start_time)
+        original_start_time = component._original_start_time
+        start_time = component._BaseComponent__start_time
+        # Not restarted yet, so they should be the same
+        self.assertEqual(original_start_time, start_time)
+
+        self._timeskip()
+        # Make it fail
+        restarted = component.failed(1)
+        # should signal that it restarted
+        self.assertTrue(restarted)
+        # and check if it really did
         self.__check_restarted(component)
         self.__check_restarted(component)
 
 
+        # original start time should not have changed
+        self.assertEqual(original_start_time, component._original_start_time)
+        # but actual start time should
+        self.assertNotEqual(start_time, component._BaseComponent__start_time)
+
     def test_bad_kind(self):
     def test_bad_kind(self):
         """
         """
         Test the component rejects nonsensical kinds. This includes bad
         Test the component rejects nonsensical kinds. This includes bad