Browse Source

[master] Merge branch 'trac2244'

JINMEI Tatuya 12 years ago
parent
commit
7565788d06

+ 10 - 0
src/bin/bind10/bind10_messages.mes

@@ -141,6 +141,16 @@ it now. The new configuration is printed.
 % BIND10_RECEIVED_SIGNAL received signal %1
 The boss module received the given signal.
 
+% BIND10_RESTART_COMPONENT_SKIPPED Skipped restarting a component %1
+The boss module tried to restart a component after it failed (crashed)
+unexpectedly, but the boss then found that the component had been removed
+from its local configuration of components to run.  This is an unusal
+situation but can happen if the administrator removes the component from
+the configuration after the component's crash and before the restart time.
+The boss module simply skipped restarting that module, and the whole system
+went back to the expected state (except that the crash itself is likely
+to be a bug).
+
 % BIND10_RESURRECTED_PROCESS resurrected %1 (PID %2)
 The given process has been restarted successfully, and is now running
 with the given process id.

+ 7 - 2
src/bin/bind10/bind10_src.py.in

@@ -739,7 +739,7 @@ class BoB:
                 component = self.components.pop(pid)
                 logger.info(BIND10_PROCESS_ENDED, component.name(), pid,
                             exit_status)
-                if component.running() and self.runnable:
+                if component.is_running() and self.runnable:
                     # Tell it it failed. But only if it matters (we are
                     # not shutting down and the component considers itself
                     # to be running.
@@ -771,7 +771,12 @@ class BoB:
         next_restart_time = None
         now = time.time()
         for component in self.components_to_restart:
-            if not component.restart(now):
+            # If the component was removed from the configurator between since
+            # scheduled to restart, just ignore it.  The object will just be
+            # dropped here.
+            if not self._component_configurator.has_component(component):
+                logger.info(BIND10_RESTART_COMPONENT_SKIPPED, component.name())
+            elif not component.restart(now):
                 still_dead.append(component)
                 if next_restart_time is None or\
                    next_restart_time > component.get_restart_time():

+ 35 - 0
src/bin/bind10/tests/bind10_test.py.in

@@ -929,7 +929,14 @@ class MockComponent:
         self.name = lambda: name
         self.pid = lambda: pid
         self.address = lambda: address
+        self.restarted = False
 
+    def get_restart_time(self):
+        return 0                # arbitrary dummy value
+
+    def restart(self, now):
+        self.restarted = True
+        return True
 
 class TestBossCmd(unittest.TestCase):
     def test_ping(self):
@@ -1266,6 +1273,34 @@ class TestBossComponents(unittest.TestCase):
         bob.start_all_components()
         self.__check_extended(self.__param)
 
+    def __setup_restart(self, bob, component):
+        '''Common procedure for restarting a component used below.'''
+        bob.components_to_restart = { component }
+        component.restarted = False
+        bob.restart_processes()
+
+    def test_restart_processes(self):
+        '''Check some behavior on restarting processes.'''
+        bob = MockBob()
+        bob.runnable = True
+        component = MockComponent('test', 53)
+
+        # A component to be restarted will actually be restarted iff it's
+        # in the configurator's configuration.
+        # We bruteforce the configurator internal below; ugly, but the easiest
+        # way for the test.
+        bob._component_configurator._components['test'] = (None, component)
+        self.__setup_restart(bob, component)
+        self.assertTrue(component.restarted)
+        self.assertFalse(component in bob.components_to_restart)
+
+        # Remove the component from the configuration.  It won't be restarted
+        # even if scheduled, nor will remain in the to-be-restarted list.
+        del bob._component_configurator._components['test']
+        self.__setup_restart(bob, component)
+        self.assertFalse(component.restarted)
+        self.assertFalse(component in bob.components_to_restart)
+
 class SocketSrvTest(unittest.TestCase):
     """
     This tests some methods of boss related to the unix domain sockets used

+ 27 - 9
src/lib/python/isc/bind10/component.py

@@ -45,6 +45,7 @@ COMPONENT_RESTART_DELAY = 10
 
 STATE_DEAD = 'dead'
 STATE_STOPPED = 'stopped'
+STATE_RESTARTING = 'restarting'
 STATE_RUNNING = 'running'
 
 def get_signame(signal_number):
@@ -68,6 +69,7 @@ class BaseComponent:
       explicitly).
     - Running - after start() was called, it started successfully and is
       now running.
+    - Restarting - the component failed (crashed) and is waiting for a restart
     - Dead - it failed and can not be resurrected.
 
     Init
@@ -79,11 +81,11 @@ class BaseComponent:
                     |            |                |
                     |failure     | failed()       |
                     |            |                |
-                    v            |                |
+                    v            |                | start()/restart()
                     +<-----------+                |
                     |                             |
                     |  kind == dispensable or kind|== needed and failed late
-                    +-----------------------------+
+                    +-----------------------> Restarting
                     |
                     | kind == core or kind == needed and it failed too soon
                     v
@@ -163,7 +165,7 @@ class BaseComponent:
         """
         if self.__state == STATE_DEAD:
             raise ValueError("Can't resurrect already dead component")
-        if self.running():
+        if self.is_running():
             raise ValueError("Can't start already running component")
         logger.info(BIND10_COMPONENT_START, self.name())
         self.__state = STATE_RUNNING
@@ -188,7 +190,7 @@ class BaseComponent:
         """
         # This is not tested. It talks with the outher world, which is out
         # of scope of unittests.
-        if not self.running():
+        if not self.is_running():
             raise ValueError("Can't stop a component which is not running")
         logger.info(BIND10_COMPONENT_STOP, self.name())
         self.__state = STATE_STOPPED
@@ -234,9 +236,9 @@ class BaseComponent:
 
         logger.error(BIND10_COMPONENT_FAILED, self.name(), self.pid(),
                      exit_str)
-        if not self.running():
+        if not self.is_running():
             raise ValueError("Can't fail component that isn't running")
-        self.__state = STATE_STOPPED
+        self.__state = STATE_RESTARTING # tentatively set, maybe changed to DEAD
         self._failed_internal()
         # If it is a core component or the needed component failed to start
         # (including it stopped really soon)
@@ -284,7 +286,7 @@ class BaseComponent:
         else:
             return False
 
-    def running(self):
+    def is_running(self):
         """
         Informs if the component is currently running. It assumes the failed
         is called whenever the component really fails and there might be some
@@ -296,6 +298,15 @@ class BaseComponent:
         """
         return self.__state == STATE_RUNNING
 
+    def is_restarting(self):
+        """Informs if the component has failed and is waiting for a restart.
+
+        Unlike the case of is_running(), if this returns True it always means
+        the corresponding process has died and not yet restarted.
+
+        """
+        return self.__state == STATE_RESTARTING
+
     def _start_internal(self):
         """
         This method does the actual starting of a process. You need to override
@@ -560,6 +571,13 @@ class Configurator:
         self._running = False
         self.__reconfigure_internal(self._components, {})
 
+    def has_component(self, component):
+        '''Return if a specified component is configured.'''
+        # Values of self._components are tuples of (config, component).
+        # Extract the components of the tuples and see if the given one
+        # is included.
+        return component in map(lambda x: x[1], self._components.values())
+
     def reconfigure(self, configuration):
         """
         Changes configuration from the current one to the provided. It
@@ -591,7 +609,7 @@ class Configurator:
         for cname in old.keys():
             if cname not in new:
                 component = self._components[cname][1]
-                if component.running():
+                if component.is_running() or component.is_restarting():
                     plan.append({
                         'command': STOP_CMD,
                         'component': component,
@@ -674,7 +692,7 @@ class Configurator:
                     self._components[task['name']] = (task['config'],
                                                       component)
                 elif command == STOP_CMD:
-                    if component.running():
+                    if component.is_running():
                         component.stop()
                     del self._components[task['name']]
                 else:

+ 43 - 22
src/lib/python/isc/bind10/tests/component_test.py

@@ -191,7 +191,8 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.assertFalse(self.__start_called)
         self.assertFalse(self.__stop_called)
         self.assertFalse(self.__failed_called)
-        self.assertFalse(component.running())
+        self.assertFalse(component.is_running())
+        self.assertFalse(component.is_restarting())
         # We can't stop or fail the component yet
         self.assertRaises(ValueError, component.stop)
         self.assertRaises(ValueError, component.failed, 1)
@@ -204,7 +205,8 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.assertTrue(self.__start_called)
         self.assertFalse(self.__stop_called)
         self.assertFalse(self.__failed_called)
-        self.assertTrue(component.running())
+        self.assertTrue(component.is_running())
+        self.assertFalse(component.is_restarting())
 
     def __check_dead(self, component):
         """
@@ -215,7 +217,8 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.assertFalse(self.__stop_called)
         self.assertTrue(self.__failed_called)
         self.assertEqual(1, self._exitcode)
-        self.assertFalse(component.running())
+        self.assertFalse(component.is_running())
+        self.assertFalse(component.is_restarting())
         # Surely it can't be stopped when already dead
         self.assertRaises(ValueError, component.stop)
         # Nor started
@@ -234,7 +237,8 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.assertTrue(self.__start_called)
         self.assertFalse(self.__stop_called)
         self.assertTrue(self.__failed_called)
-        self.assertTrue(component.running())
+        self.assertTrue(component.is_running())
+        self.assertFalse(component.is_restarting())
         # Check it can't be started again
         self.assertRaises(ValueError, component.start)
 
@@ -246,7 +250,8 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.assertTrue(self.__start_called)
         self.assertFalse(self.__stop_called)
         self.assertTrue(self.__failed_called)
-        self.assertFalse(component.running())
+        self.assertFalse(component.is_running())
+        self.assertTrue(component.is_restarting())
 
     def __do_start_stop(self, kind):
         """
@@ -270,7 +275,8 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.assertTrue(self.__start_called)
         self.assertTrue(self.__stop_called)
         self.assertFalse(self.__failed_called)
-        self.assertFalse(component.running())
+        self.assertFalse(component.is_running())
+        self.assertFalse(component.is_restarting())
         # Check it can't be stopped twice
         self.assertRaises(ValueError, component.stop)
         # Or failed
@@ -553,10 +559,10 @@ class ComponentTests(BossUtils, unittest.TestCase):
         self.assertIsNone(component.pid())
         self.assertEqual(['hello'], component._params)
         self.assertEqual('Address', component._address)
-        self.assertFalse(component.running())
+        self.assertFalse(component.is_running())
         self.assertEqual({}, self.__registered_processes)
         component.start()
-        self.assertTrue(component.running())
+        self.assertTrue(component.is_running())
         # Some versions of unittest miss assertIsInstance
         self.assertTrue(isinstance(component._procinfo, TestProcInfo))
         self.assertEqual(42, component.pid())
@@ -580,11 +586,11 @@ class ComponentTests(BossUtils, unittest.TestCase):
         """
         component = Component('component', self, 'needed', 'Address')
         component.start()
-        self.assertTrue(component.running())
+        self.assertTrue(component.is_running())
         self.assertEqual('component', self.__start_simple_params)
         component.pid = lambda: 42
         component.stop()
-        self.assertFalse(component.running())
+        self.assertFalse(component.is_running())
         self.assertEqual(('component', 'Address', 42),
                          self.__stop_process_params)
 
@@ -609,7 +615,7 @@ class ComponentTests(BossUtils, unittest.TestCase):
         component = Component('component', self, 'needed', 'Address',
                               [], ProcInfo)
         component.start()
-        self.assertTrue(component.running())
+        self.assertTrue(component.is_running())
         component.kill()
         self.assertTrue(process.terminated)
         self.assertFalse(process.killed)
@@ -872,12 +878,13 @@ class ConfiguratorTest(BossUtils, unittest.TestCase):
             'priority': 6,
             'special': 'test',
             'process': 'additional',
-            'kind': 'needed'
+            'kind': 'dispensable' # need to be dispensable so it could restart
         }
         self.log = []
         plan = configurator._build_plan(self.__build_components(self.__core),
                                         bigger)
-        self.assertEqual([('additional', 'init'), ('additional', 'needed')],
+        self.assertEqual([('additional', 'init'),
+                          ('additional', 'dispensable')],
                          self.log)
         self.assertEqual(1, len(plan))
         self.assertTrue('component' in plan[0])
@@ -888,15 +895,29 @@ class ConfiguratorTest(BossUtils, unittest.TestCase):
         # Now remove the one component again
         # We run the plan so the component is wired into internal structures
         configurator._run_plan(plan)
-        self.log = []
-        plan = configurator._build_plan(self.__build_components(bigger),
-                                        self.__core)
-        self.assertEqual([], self.log)
-        self.assertEqual([{
-            'command': 'stop',
-            'name': 'additional',
-            'component': component
-        }], plan)
+        # We should have the 'additional' component in the configurator.
+        self.assertTrue(configurator.has_component(component))
+        for count in [0, 1]:    # repeat two times, see below
+            self.log = []
+            plan = configurator._build_plan(self.__build_components(bigger),
+                                            self.__core)
+            self.assertEqual([], self.log)
+            self.assertEqual([{
+                        'command': 'stop',
+                        'name': 'additional',
+                        'component': component
+                        }], plan)
+
+            if count is 0:
+                # We then emulate unexpected failure of the component (but
+                # before it restarts).  It shouldn't confuse the
+                # configurator in the second phase of the test
+                component.failed(0)
+        # Run the plan, confirm the specified component is gone.
+        configurator._run_plan(plan)
+        self.assertFalse(configurator.has_component(component))
+        # There shouldn't be any other object that has the same name
+        self.assertFalse('additional' in configurator._components)
 
         # We want to switch a component. So, prepare the configurator so it
         # holds one