Browse Source

[2020] introduce waiting periods and retries if add_remote_config fails.

also, make sure catching ModuleSpecError in addition to ModuleCCSessionError
because that's actually raised when the speicified module is not running
(seems buggy, but that happens in the real world).
JINMEI Tatuya 13 years ago
parent
commit
43feaa8658
3 changed files with 100 additions and 36 deletions
  1. 39 9
      src/bin/ddns/ddns.py.in
  2. 19 15
      src/bin/ddns/ddns_messages.mes
  3. 42 12
      src/bin/ddns/tests/ddns_test.py

+ 39 - 9
src/bin/ddns/ddns.py.in

@@ -25,6 +25,7 @@ import isc.ddns.session
 from isc.ddns.zone_config import ZoneConfig
 from isc.ddns.logger import ClientFormatter, ZoneFormatter
 from isc.config.ccsession import *
+from isc.config.module_spec import ModuleSpecError
 from isc.cc import SessionError, SessionTimeout, ProtocolError
 import isc.util.process
 import isc.util.cio.socketsession
@@ -34,6 +35,7 @@ from isc.server_common.dns_tcp import DNSTCPContext
 from isc.datasrc import DataSourceClient
 from isc.server_common.auth_command import auth_loadzone_command
 import select
+import time
 import errno
 
 from isc.log_messages.ddns_messages import *
@@ -151,6 +153,14 @@ def get_datasrc_client(cc_session):
                 raise isc.datasrc.Error(self.__ex)
         return (HARDCODED_DATASRC_CLASS, DummyDataSourceClient(ex), file)
 
+def add_pause(sec):
+    '''Pause a specified period for inter module synchronization.
+
+    This is a trivial wrraper of time.sleep, but defined as a separate function
+    so tests can customize it.
+    '''
+    time.sleep(sec)
+
 class DDNSServer:
     # The number of TCP clients that can be handled by the server at the same
     # time (this should be configurable parameter).
@@ -192,14 +202,9 @@ class DDNSServer:
         self._secondary_zones = None
 
         # Get necessary configurations from remote modules.
-        try:
-            mods = [(AUTH_MODULE_NAME, self.__auth_config_handler),
-                    (ZONEMGR_MODULE_NAME, self.__zonemgr_config_handler)]
-            for mod in mods:
-                self._cc.add_remote_config_by_name(mod[0], mod[1])
-        except ModuleCCSessionError as ex:
-            logger.error(DDNS_GET_REMOTE_CONFIG_FAIL, mod[0], ex)
-            raise ex    # propagate it and die for now
+        for mod in [(AUTH_MODULE_NAME, self.__auth_config_handler),
+                    (ZONEMGR_MODULE_NAME, self.__zonemgr_config_handler)]:
+            self.__add_remote_module(mod[0], mod[1])
         # This should succeed as long as cfgmgr is up.
         isc.server_common.tsig_keyring.init_keyring(self._cc)
 
@@ -274,6 +279,31 @@ class DDNSServer:
             answer = create_answer(1, "Unknown command: " + str(cmd))
         return answer
 
+    def __add_remote_module(self, mod_name, callback):
+        '''Register interest in other module's config with a callback.'''
+
+        # Due to startup timing, add_remote_config can fail.  We could make it
+        # more sophisticated, but for now we simply retry a few times, each
+        # separated by a short period (3 times and 1 sec, arbitrary chosen,
+        # and hardcoded for now).  In practice this should be more than
+        # sufficient, but if it turns out to be a bigger problem we can
+        # consider more elegant solutions.
+        for n_try in range(0, 3):
+            try:
+                # by_name() version can fail with ModuleSpecError in getting
+                # the module spec because cfgmgr returns a "successful" answer
+                # with empty data if it cannot find the specified module.
+                # This seems to be a deviant behavior (see Trac #2039), but
+                # we need to deal with it.
+                self._cc.add_remote_config_by_name(mod_name, callback)
+                return
+            except (ModuleSpecError, ModuleCCSessionError) as ex:
+                logger.warn(DDNS_GET_REMOTE_CONFIG_FAIL, mod_name, n_try + 1,
+                            ex)
+                last_ex = ex
+                add_pause(1)
+        raise last_ex
+
     def __auth_config_handler(self, new_config, module_config):
         logger.info(DDNS_RECEIVED_AUTH_UPDATE)
 
@@ -679,7 +709,7 @@ def main(ddns_server=None):
         logger.info(DDNS_STOPPED_BY_KEYBOARD)
     except SessionError as e:
         logger.error(DDNS_CC_SESSION_ERROR, str(e))
-    except ModuleCCSessionError as e:
+    except (ModuleSpecError, ModuleCCSessionError) as e:
         logger.error(DDNS_MODULECC_SESSION_ERROR, str(e))
     except DDNSConfigError as e:
         logger.error(DDNS_CONFIG_ERROR, str(e))

+ 19 - 15
src/bin/ddns/ddns_messages.mes

@@ -59,24 +59,28 @@ authoritative server shuts down, the connection would get closed. It also
 can mean the system is busy and can't keep up or that the other side got
 confused and sent bad data.
 
-% DDNS_GET_REMOTE_CONFIG_FAIL failed to get %1 module configuration: %2
+% DDNS_GET_REMOTE_CONFIG_FAIL failed to get %1 module configuration %2 times: %3
 b10-ddns tried to get configuration of some remote modules for its
 operation, but it failed.  The most likely cause of this is that the
 remote module has not fully started up and b10-ddns couldn't get the
-configuration in a timely fashion.  In the current implementation
-b10-ddns considers it a fatal error and terminates; however, as long
-as b10-ddns is configured as a "dispensable" component (which is the
-default), the parent bind10 process will restart it, and getting the
-remote configuration will eventually (and soon) succeed.  This is not
-the optimal behavior, but should be sufficient in practice.  If it
-really causes an operational trouble other than having a few of this
-log messages, please submit a bug report; there can be several ways to
-make it more sophisticated.  Another, less likely reason for having
-this error is because the remote modules are not actually configured
-to run.  If that's the case fixing the configuration should solve the
-problem - either by making sure the remote module will run or by not
-running b10-ddns (without these remote modules b10-ddns is not
-functional, so there's no point in running it in this case).
+configuration in a timely fashion.  b10-ddns attempts to retry it a
+few times, imposing a short delay, hoping it eventually succeeds if
+it's just a timing issue.  The number of total failed attempts is also
+logged.  If it reaches an internal threshold b10-ddns considers it a
+fatal error and terminates.  Even in that case, if b10-ddns is
+configured as a "dispensable" component (which is the default), the
+parent bind10 process will restart it, and there will be another
+chance of getting the remote configuration successfully.  These are
+not the optimal behavior, but it's believed to be sufficient in
+practice (there would normally be no failure in the first place).  If
+it really causes an operational trouble other than having a few of
+these log messages, please submit a bug report; there can be several
+ways to make it more sophisticated.  Another, less likely reason for
+having this error is because the remote modules are not actually
+configured to run.  If that's the case fixing the configuration should
+solve the problem - either by making sure the remote module will run
+or by not running b10-ddns (without these remote modules b10-ddns is
+not functional, so there's no point in running it in this case).
 
 % DDNS_MODULECC_SESSION_ERROR error encountered by configuration/command module: %1
 There was a problem in the lower level module handling configuration and

+ 42 - 12
src/bin/ddns/tests/ddns_test.py

@@ -24,6 +24,7 @@ from isc.datasrc import DataSourceClient
 from isc.config import module_spec_from_file
 from isc.config.config_data import ConfigData
 from isc.config.ccsession import create_answer, ModuleCCSessionError
+from isc.config.module_spec import ModuleSpecError
 from isc.server_common.dns_tcp import DNSTCPContext
 import ddns
 import errno
@@ -217,8 +218,8 @@ class MyCCSession(isc.config.ConfigData):
 
         # Attributes to handle (faked) remote configurations
         self.__callbacks = {}   # record callbacks for updates to remote confs
-        self._raise_mods = set()  # set of modules that triggers exception
-                                  # on add_remote.  settable by tests.
+        self._raise_mods = {}  # map of module to exceptions to be triggered
+                               # on add_remote.  settable by tests.
         self._auth_config = {}  # faked auth cfg, settable by tests
         self._zonemgr_config = {} # faked zonemgr cfg, settable by tests
 
@@ -237,8 +238,15 @@ class MyCCSession(isc.config.ConfigData):
         return FakeSocket(1)
 
     def add_remote_config_by_name(self, module_name, update_callback=None):
-        if module_name in self._raise_mods:
-            raise ModuleCCSessionError('Failure requesting remote config data')
+        # If a list of exceptions is given for the module, raise the front one,
+        # removing that exception from the list (so the list length controls
+        # how many (and which) exceptions should be raised on add_remote).
+        if module_name in self._raise_mods.keys() and \
+                len(self._raise_mods[module_name]) != 0:
+            ex = self._raise_mods[module_name][0]
+            self._raise_mods[module_name] = self._raise_mods[module_name][1:]
+            raise ex('Failure requesting remote config data')
+
         if update_callback is not None:
             self.__callbacks[module_name] = update_callback
         if module_name is 'Auth':
@@ -340,12 +348,15 @@ class TestDDNSServer(unittest.TestCase):
         self.__tcp_sock = FakeSocket(10, socket.IPPROTO_TCP)
         self.__tcp_ctx = DNSTCPContext(self.__tcp_sock)
         self.__tcp_data = b'A' * 12 # dummy, just the same size as DNS header
+        # some tests will override this, which will be restored in tearDown:
+        self.__orig_add_pause = ddns.add_pause
 
     def tearDown(self):
         ddns.select.select = select.select
         ddns.isc.util.cio.socketsession.SocketSessionReceiver = \
             isc.util.cio.socketsession.SocketSessionReceiver
         isc.server_common.tsig_keyring = self.orig_tsig_keyring
+        ddns.add_pause = self.__orig_add_pause
 
     def test_listen(self):
         '''
@@ -532,16 +543,35 @@ class TestDDNSServer(unittest.TestCase):
         self.assertEqual({(TEST_ZONE_NAME, TEST_RRCLASS)},
                          self.ddns_server._secondary_zones)
 
+    def __check_remote_config_fail(self, mod_name, num_ex, expected_ex):
+        '''Subroutine for remote_config_fail test.'''
+
+        # fake pause function for inspection and to avoid having timeouts
+        added_pause = []
+        ddns.add_pause = lambda sec: added_pause.append(sec)
+
+        # In our current implementation, there will be up to 3 tries of
+        # adding the module, each separated by a 1-sec pause.  If all attempts
+        # fail the exception will be propagated.
+        exceptions = [expected_ex for i in range(0, num_ex)]
+        self.__cc_session._raise_mods = {mod_name: exceptions}
+        if num_ex >= 3:
+            self.assertRaises(expected_ex, ddns.DDNSServer, self.__cc_session)
+        else:
+            ddns.DDNSServer(self.__cc_session)
+        self.assertEqual([1 for i in range(0, num_ex)], added_pause)
+
     def test_remote_config_fail(self):
         # If getting config of Auth or Zonemgr fails on construction of
-        # DDNServer, it should result in ModuleCCSessionError.
-        self.__cc_session._raise_mods.add('Auth')
-        self.assertRaises(ModuleCCSessionError, ddns.DDNSServer,
-                          self.__cc_session)
-
-        self.__cc_session._raise_mods = {'Zonemgr'}
-        self.assertRaises(ModuleCCSessionError, ddns.DDNSServer,
-                          self.__cc_session)
+        # DDNServer, it should result in an exception and a few times
+        # of retries.  We test all possible cases, changing the number of
+        # raised exceptions and the type of exceptions that can happen,
+        # which should also cover the fatal error case.
+        for i in range(0, 4):
+            self.__check_remote_config_fail('Auth', i, ModuleCCSessionError)
+            self.__check_remote_config_fail('Auth', i, ModuleSpecError)
+            self.__check_remote_config_fail('Zonemgr', i, ModuleCCSessionError)
+            self.__check_remote_config_fail('Zonemgr', i, ModuleSpecError)
 
     def test_shutdown_command(self):
         '''Test whether the shutdown command works'''