Browse Source

[master] Merge branch 'trac2244'

JINMEI Tatuya 12 years ago
parent
commit
7565788d06

+ 10 - 0
src/bin/bind10/bind10_messages.mes

@@ -141,6 +141,16 @@ it now. The new configuration is printed.
 % BIND10_RECEIVED_SIGNAL received signal %1
 % BIND10_RECEIVED_SIGNAL received signal %1
 The boss module received the given signal.
 The boss module received the given signal.
 
 
+% BIND10_RESTART_COMPONENT_SKIPPED Skipped restarting a component %1
+The boss module tried to restart a component after it failed (crashed)
+unexpectedly, but the boss then found that the component had been removed
+from its local configuration of components to run.  This is an unusal
+situation but can happen if the administrator removes the component from
+the configuration after the component's crash and before the restart time.
+The boss module simply skipped restarting that module, and the whole system
+went back to the expected state (except that the crash itself is likely
+to be a bug).
+
 % BIND10_RESURRECTED_PROCESS resurrected %1 (PID %2)
 % BIND10_RESURRECTED_PROCESS resurrected %1 (PID %2)
 The given process has been restarted successfully, and is now running
 The given process has been restarted successfully, and is now running
 with the given process id.
 with the given process id.

+ 7 - 2
src/bin/bind10/bind10_src.py.in

@@ -739,7 +739,7 @@ class BoB:
                 component = self.components.pop(pid)
                 component = self.components.pop(pid)
                 logger.info(BIND10_PROCESS_ENDED, component.name(), pid,
                 logger.info(BIND10_PROCESS_ENDED, component.name(), pid,
                             exit_status)
                             exit_status)
-                if component.running() and self.runnable:
+                if component.is_running() and self.runnable:
                     # Tell it it failed. But only if it matters (we are
                     # Tell it it failed. But only if it matters (we are
                     # not shutting down and the component considers itself
                     # not shutting down and the component considers itself
                     # to be running.
                     # to be running.
@@ -771,7 +771,12 @@ class BoB:
         next_restart_time = None
         next_restart_time = None
         now = time.time()
         now = time.time()
         for component in self.components_to_restart:
         for component in self.components_to_restart:
-            if not component.restart(now):
+            # If the component was removed from the configurator between since
+            # scheduled to restart, just ignore it.  The object will just be
+            # dropped here.
+            if not self._component_configurator.has_component(component):
+                logger.info(BIND10_RESTART_COMPONENT_SKIPPED, component.name())
+            elif not component.restart(now):
                 still_dead.append(component)
                 still_dead.append(component)
                 if next_restart_time is None or\
                 if next_restart_time is None or\
                    next_restart_time > component.get_restart_time():
                    next_restart_time > component.get_restart_time():

+ 35 - 0
src/bin/bind10/tests/bind10_test.py.in

@@ -929,7 +929,14 @@ class MockComponent:
         self.name = lambda: name
         self.name = lambda: name
         self.pid = lambda: pid
         self.pid = lambda: pid
         self.address = lambda: address
         self.address = lambda: address
+        self.restarted = False
 
 
+    def get_restart_time(self):
+        return 0                # arbitrary dummy value
+
+    def restart(self, now):
+        self.restarted = True
+        return True
 
 
 class TestBossCmd(unittest.TestCase):
 class TestBossCmd(unittest.TestCase):
     def test_ping(self):
     def test_ping(self):
@@ -1266,6 +1273,34 @@ class TestBossComponents(unittest.TestCase):
         bob.start_all_components()
         bob.start_all_components()
         self.__check_extended(self.__param)
         self.__check_extended(self.__param)
 
 
+    def __setup_restart(self, bob, component):
+        '''Common procedure for restarting a component used below.'''
+        bob.components_to_restart = { component }
+        component.restarted = False
+        bob.restart_processes()
+
+    def test_restart_processes(self):
+        '''Check some behavior on restarting processes.'''
+        bob = MockBob()
+        bob.runnable = True
+        component = MockComponent('test', 53)
+
+        # A component to be restarted will actually be restarted iff it's
+        # in the configurator's configuration.
+        # We bruteforce the configurator internal below; ugly, but the easiest
+        # way for the test.
+        bob._component_configurator._components['test'] = (None, component)
+        self.__setup_restart(bob, component)
+        self.assertTrue(component.restarted)
+        self.assertFalse(component in bob.components_to_restart)
+
+        # Remove the component from the configuration.  It won't be restarted
+        # even if scheduled, nor will remain in the to-be-restarted list.
+        del bob._component_configurator._components['test']
+        self.__setup_restart(bob, component)
+        self.assertFalse(component.restarted)
+        self.assertFalse(component in bob.components_to_restart)
+
 class SocketSrvTest(unittest.TestCase):
 class SocketSrvTest(unittest.TestCase):
     """
     """
     This tests some methods of boss related to the unix domain sockets used
     This tests some methods of boss related to the unix domain sockets used

+ 27 - 9
src/lib/python/isc/bind10/component.py

@@ -45,6 +45,7 @@ COMPONENT_RESTART_DELAY = 10
 
 
 STATE_DEAD = 'dead'
 STATE_DEAD = 'dead'
 STATE_STOPPED = 'stopped'
 STATE_STOPPED = 'stopped'
+STATE_RESTARTING = 'restarting'
 STATE_RUNNING = 'running'
 STATE_RUNNING = 'running'
 
 
 def get_signame(signal_number):
 def get_signame(signal_number):
@@ -68,6 +69,7 @@ class BaseComponent:
       explicitly).
       explicitly).
     - Running - after start() was called, it started successfully and is
     - Running - after start() was called, it started successfully and is
       now running.
       now running.
+    - Restarting - the component failed (crashed) and is waiting for a restart
     - Dead - it failed and can not be resurrected.
     - Dead - it failed and can not be resurrected.
 
 
     Init
     Init
@@ -79,11 +81,11 @@ class BaseComponent:
                     |            |                |
                     |            |                |
                     |failure     | failed()       |
                     |failure     | failed()       |
                     |            |                |
                     |            |                |
-                    v            |                |
+                    v            |                | start()/restart()
                     +<-----------+                |
                     +<-----------+                |
                     |                             |
                     |                             |
                     |  kind == dispensable or kind|== needed and failed late
                     |  kind == dispensable or kind|== needed and failed late
-                    +-----------------------------+
+                    +-----------------------> Restarting
                     |
                     |
                     | kind == core or kind == needed and it failed too soon
                     | kind == core or kind == needed and it failed too soon
                     v
                     v
@@ -163,7 +165,7 @@ class BaseComponent:
         """
         """
         if self.__state == STATE_DEAD:
         if self.__state == STATE_DEAD:
             raise ValueError("Can't resurrect already dead component")
             raise ValueError("Can't resurrect already dead component")
-        if self.running():
+        if self.is_running():
             raise ValueError("Can't start already running component")
             raise ValueError("Can't start already running component")
         logger.info(BIND10_COMPONENT_START, self.name())
         logger.info(BIND10_COMPONENT_START, self.name())
         self.__state = STATE_RUNNING
         self.__state = STATE_RUNNING
@@ -188,7 +190,7 @@ class BaseComponent:
         """
         """
         # This is not tested. It talks with the outher world, which is out
         # This is not tested. It talks with the outher world, which is out
         # of scope of unittests.
         # of scope of unittests.
-        if not self.running():
+        if not self.is_running():
             raise ValueError("Can't stop a component which is not running")
             raise ValueError("Can't stop a component which is not running")
         logger.info(BIND10_COMPONENT_STOP, self.name())
         logger.info(BIND10_COMPONENT_STOP, self.name())
         self.__state = STATE_STOPPED
         self.__state = STATE_STOPPED
@@ -234,9 +236,9 @@ class BaseComponent:
 
 
         logger.error(BIND10_COMPONENT_FAILED, self.name(), self.pid(),
         logger.error(BIND10_COMPONENT_FAILED, self.name(), self.pid(),
                      exit_str)
                      exit_str)
-        if not self.running():
+        if not self.is_running():
             raise ValueError("Can't fail component that isn't running")
             raise ValueError("Can't fail component that isn't running")
-        self.__state = STATE_STOPPED
+        self.__state = STATE_RESTARTING # tentatively set, maybe changed to DEAD
         self._failed_internal()
         self._failed_internal()
         # If it is a core component or the needed component failed to start
         # If it is a core component or the needed component failed to start
         # (including it stopped really soon)
         # (including it stopped really soon)
@@ -284,7 +286,7 @@ class BaseComponent:
         else:
         else:
             return False
             return False
 
 
-    def running(self):
+    def is_running(self):
         """
         """
         Informs if the component is currently running. It assumes the failed
         Informs if the component is currently running. It assumes the failed
         is called whenever the component really fails and there might be some
         is called whenever the component really fails and there might be some
@@ -296,6 +298,15 @@ class BaseComponent:
         """
         """
         return self.__state == STATE_RUNNING
         return self.__state == STATE_RUNNING
 
 
+    def is_restarting(self):
+        """Informs if the component has failed and is waiting for a restart.
+
+        Unlike the case of is_running(), if this returns True it always means
+        the corresponding process has died and not yet restarted.
+
+        """
+        return self.__state == STATE_RESTARTING
+
     def _start_internal(self):
     def _start_internal(self):
         """
         """
         This method does the actual starting of a process. You need to override
         This method does the actual starting of a process. You need to override
@@ -560,6 +571,13 @@ class Configurator:
         self._running = False
         self._running = False
         self.__reconfigure_internal(self._components, {})
         self.__reconfigure_internal(self._components, {})
 
 
+    def has_component(self, component):
+        '''Return if a specified component is configured.'''
+        # Values of self._components are tuples of (config, component).
+        # Extract the components of the tuples and see if the given one
+        # is included.
+        return component in map(lambda x: x[1], self._components.values())
+
     def reconfigure(self, configuration):
     def reconfigure(self, configuration):
         """
         """
         Changes configuration from the current one to the provided. It
         Changes configuration from the current one to the provided. It
@@ -591,7 +609,7 @@ class Configurator:
         for cname in old.keys():
         for cname in old.keys():
             if cname not in new:
             if cname not in new:
                 component = self._components[cname][1]
                 component = self._components[cname][1]
-                if component.running():
+                if component.is_running() or component.is_restarting():
                     plan.append({
                     plan.append({
                         'command': STOP_CMD,
                         'command': STOP_CMD,
                         'component': component,
                         'component': component,
@@ -674,7 +692,7 @@ class Configurator:
                     self._components[task['name']] = (task['config'],
                     self._components[task['name']] = (task['config'],
                                                       component)
                                                       component)
                 elif command == STOP_CMD:
                 elif command == STOP_CMD:
-                    if component.running():
+                    if component.is_running():
                         component.stop()
                         component.stop()
                     del self._components[task['name']]
                     del self._components[task['name']]
                 else:
                 else:

+ 43 - 22
src/lib/python/isc/bind10/tests/component_test.py

@@ -191,7 +191,8 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.assertFalse(self.__start_called)
         self.assertFalse(self.__start_called)
         self.assertFalse(self.__stop_called)
         self.assertFalse(self.__stop_called)
         self.assertFalse(self.__failed_called)
         self.assertFalse(self.__failed_called)
-        self.assertFalse(component.running())
+        self.assertFalse(component.is_running())
+        self.assertFalse(component.is_restarting())
         # We can't stop or fail the component yet
         # We can't stop or fail the component yet
         self.assertRaises(ValueError, component.stop)
         self.assertRaises(ValueError, component.stop)
         self.assertRaises(ValueError, component.failed, 1)
         self.assertRaises(ValueError, component.failed, 1)
@@ -204,7 +205,8 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.assertTrue(self.__start_called)
         self.assertTrue(self.__start_called)
         self.assertFalse(self.__stop_called)
         self.assertFalse(self.__stop_called)
         self.assertFalse(self.__failed_called)
         self.assertFalse(self.__failed_called)
-        self.assertTrue(component.running())
+        self.assertTrue(component.is_running())
+        self.assertFalse(component.is_restarting())
 
 
     def __check_dead(self, component):
     def __check_dead(self, component):
         """
         """
@@ -215,7 +217,8 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.assertFalse(self.__stop_called)
         self.assertFalse(self.__stop_called)
         self.assertTrue(self.__failed_called)
         self.assertTrue(self.__failed_called)
         self.assertEqual(1, self._exitcode)
         self.assertEqual(1, self._exitcode)
-        self.assertFalse(component.running())
+        self.assertFalse(component.is_running())
+        self.assertFalse(component.is_restarting())
         # Surely it can't be stopped when already dead
         # Surely it can't be stopped when already dead
         self.assertRaises(ValueError, component.stop)
         self.assertRaises(ValueError, component.stop)
         # Nor started
         # Nor started
@@ -234,7 +237,8 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.assertTrue(self.__start_called)
         self.assertTrue(self.__start_called)
         self.assertFalse(self.__stop_called)
         self.assertFalse(self.__stop_called)
         self.assertTrue(self.__failed_called)
         self.assertTrue(self.__failed_called)
-        self.assertTrue(component.running())
+        self.assertTrue(component.is_running())
+        self.assertFalse(component.is_restarting())
         # Check it can't be started again
         # Check it can't be started again
         self.assertRaises(ValueError, component.start)
         self.assertRaises(ValueError, component.start)
 
 
@@ -246,7 +250,8 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.assertTrue(self.__start_called)
         self.assertTrue(self.__start_called)
         self.assertFalse(self.__stop_called)
         self.assertFalse(self.__stop_called)
         self.assertTrue(self.__failed_called)
         self.assertTrue(self.__failed_called)
-        self.assertFalse(component.running())
+        self.assertFalse(component.is_running())
+        self.assertTrue(component.is_restarting())
 
 
     def __do_start_stop(self, kind):
     def __do_start_stop(self, kind):
         """
         """
@@ -270,7 +275,8 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.assertTrue(self.__start_called)
         self.assertTrue(self.__start_called)
         self.assertTrue(self.__stop_called)
         self.assertTrue(self.__stop_called)
         self.assertFalse(self.__failed_called)
         self.assertFalse(self.__failed_called)
-        self.assertFalse(component.running())
+        self.assertFalse(component.is_running())
+        self.assertFalse(component.is_restarting())
         # Check it can't be stopped twice
         # Check it can't be stopped twice
         self.assertRaises(ValueError, component.stop)
         self.assertRaises(ValueError, component.stop)
         # Or failed
         # Or failed
@@ -553,10 +559,10 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.assertIsNone(component.pid())
         self.assertIsNone(component.pid())
         self.assertEqual(['hello'], component._params)
         self.assertEqual(['hello'], component._params)
         self.assertEqual('Address', component._address)
         self.assertEqual('Address', component._address)
-        self.assertFalse(component.running())
+        self.assertFalse(component.is_running())
         self.assertEqual({}, self.__registered_processes)
         self.assertEqual({}, self.__registered_processes)
         component.start()
         component.start()
-        self.assertTrue(component.running())
+        self.assertTrue(component.is_running())
         # Some versions of unittest miss assertIsInstance
         # Some versions of unittest miss assertIsInstance
         self.assertTrue(isinstance(component._procinfo, TestProcInfo))
         self.assertTrue(isinstance(component._procinfo, TestProcInfo))
         self.assertEqual(42, component.pid())
         self.assertEqual(42, component.pid())
@@ -580,11 +586,11 @@ class ComponentTests(BossUtils, unittest.TestCase):
         """
         """
         component = Component('component', self, 'needed', 'Address')
         component = Component('component', self, 'needed', 'Address')
         component.start()
         component.start()
-        self.assertTrue(component.running())
+        self.assertTrue(component.is_running())
         self.assertEqual('component', self.__start_simple_params)
         self.assertEqual('component', self.__start_simple_params)
         component.pid = lambda: 42
         component.pid = lambda: 42
         component.stop()
         component.stop()
-        self.assertFalse(component.running())
+        self.assertFalse(component.is_running())
         self.assertEqual(('component', 'Address', 42),
         self.assertEqual(('component', 'Address', 42),
                          self.__stop_process_params)
                          self.__stop_process_params)
 
 
@@ -609,7 +615,7 @@ class ComponentTests(BossUtils, unittest.TestCase):
         component = Component('component', self, 'needed', 'Address',
         component = Component('component', self, 'needed', 'Address',
                               [], ProcInfo)
                               [], ProcInfo)
         component.start()
         component.start()
-        self.assertTrue(component.running())
+        self.assertTrue(component.is_running())
         component.kill()
         component.kill()
         self.assertTrue(process.terminated)
         self.assertTrue(process.terminated)
         self.assertFalse(process.killed)
         self.assertFalse(process.killed)
@@ -872,12 +878,13 @@ class ConfiguratorTest(BossUtils, unittest.TestCase):
             'priority': 6,
             'priority': 6,
             'special': 'test',
             'special': 'test',
             'process': 'additional',
             'process': 'additional',
-            'kind': 'needed'
+            'kind': 'dispensable' # need to be dispensable so it could restart
         }
         }
         self.log = []
         self.log = []
         plan = configurator._build_plan(self.__build_components(self.__core),
         plan = configurator._build_plan(self.__build_components(self.__core),
                                         bigger)
                                         bigger)
-        self.assertEqual([('additional', 'init'), ('additional', 'needed')],
+        self.assertEqual([('additional', 'init'),
+                          ('additional', 'dispensable')],
                          self.log)
                          self.log)
         self.assertEqual(1, len(plan))
         self.assertEqual(1, len(plan))
         self.assertTrue('component' in plan[0])
         self.assertTrue('component' in plan[0])
@@ -888,15 +895,29 @@ class ConfiguratorTest(BossUtils, unittest.TestCase):
         # Now remove the one component again
         # Now remove the one component again
         # We run the plan so the component is wired into internal structures
         # We run the plan so the component is wired into internal structures
         configurator._run_plan(plan)
         configurator._run_plan(plan)
-        self.log = []
-        plan = configurator._build_plan(self.__build_components(bigger),
-                                        self.__core)
-        self.assertEqual([], self.log)
-        self.assertEqual([{
-            'command': 'stop',
-            'name': 'additional',
-            'component': component
-        }], plan)
+        # We should have the 'additional' component in the configurator.
+        self.assertTrue(configurator.has_component(component))
+        for count in [0, 1]:    # repeat two times, see below
+            self.log = []
+            plan = configurator._build_plan(self.__build_components(bigger),
+                                            self.__core)
+            self.assertEqual([], self.log)
+            self.assertEqual([{
+                        'command': 'stop',
+                        'name': 'additional',
+                        'component': component
+                        }], plan)
+
+            if count is 0:
+                # We then emulate unexpected failure of the component (but
+                # before it restarts).  It shouldn't confuse the
+                # configurator in the second phase of the test
+                component.failed(0)
+        # Run the plan, confirm the specified component is gone.
+        configurator._run_plan(plan)
+        self.assertFalse(configurator.has_component(component))
+        # There shouldn't be any other object that has the same name
+        self.assertFalse('additional' in configurator._components)
 
 
         # We want to switch a component. So, prepare the configurator so it
         # We want to switch a component. So, prepare the configurator so it
         # holds one
         # holds one