Browse Source

[1271] Update Boss to wait for configuration manager

Boss now waits to receive a message from the configuration manager
to say that it is running before starting other processes.
Stephen Morris 13 years ago
parent
commit
d69588a14a

+ 25 - 3
src/bin/bind10/bind10_messages.mes

@@ -28,6 +28,10 @@ started according to the configuration.
 This message shows whether or not the resolver should be
 This message shows whether or not the resolver should be
 started according to the configuration.
 started according to the configuration.
 
 
+% BIND10_INVALID_STATISTICS_DATA invalid specification of statistics data specified
+An error was encountered when the boss module specified
+statistics data which is invalid for the boss specification file.
+
 % BIND10_INVALID_USER invalid user: %1
 % BIND10_INVALID_USER invalid user: %1
 The boss process was started with the -u option, to drop root privileges
 The boss process was started with the -u option, to drop root privileges
 and continue running as the specified user, but the user is unknown.
 and continue running as the specified user, but the user is unknown.
@@ -184,6 +188,18 @@ All modules have been successfully started, and BIND 10 is now running.
 There was a fatal error when BIND10 was trying to start. The error is
 There was a fatal error when BIND10 was trying to start. The error is
 shown, and BIND10 will now shut down.
 shown, and BIND10 will now shut down.
 
 
+% BIND10_STARTUP_UNEXPECTED_MESSAGE unexpected startup message %1
+During the startup process, a number of messages are exchanged between the
+Boss process and the processes it starts.  This error is output when a
+message received by the Boss process is recognised as being of the
+correct format but is unexpected.  It may be that processes are starting
+out of sequence.
+
+% BIND10_STARTUP_UNRECOGNISED_MESSAGE unrecognised startup message %1
+During the startup process, a number of messages are exchanged between the
+Boss process and the processes it starts.  This error is output when a
+message received by the Boss process is not recognised.
+
 % BIND10_START_AS_NON_ROOT starting %1 as a user, not root. This might fail.
 % BIND10_START_AS_NON_ROOT starting %1 as a user, not root. This might fail.
 The given module is being started or restarted without root privileges.
 The given module is being started or restarted without root privileges.
 If the module needs these privileges, it may have problems starting.
 If the module needs these privileges, it may have problems starting.
@@ -199,6 +215,12 @@ the message channel.
 An unknown child process has exited. The PID is printed, but no further
 An unknown child process has exited. The PID is printed, but no further
 action will be taken by the boss process.
 action will be taken by the boss process.
 
 
-% BIND10_INVALID_STATISTICS_DATA invalid specification of statistics data specified
-An error was encountered when the boss module specified
-statistics data which is invalid for the boss specification file.
+% BIND10_WAIT_CFGMGR waiting for configuration manager process to start
+The configuration manager process is so critical to operation of BIND 10
+that after starting it, the Boss module will wait for it to initialize
+itself before continuing.  This debug message is produced during the
+wait and may be output zero or more times depending on how long it takes
+the configuration manager to start up.  The total length of time Boss
+will wait for the configuration manager before reporting an error is
+set with the command line --wait switch.
+

+ 45 - 7
src/bin/bind10/bind10_src.py.in

@@ -208,12 +208,14 @@ class ProcessInfo:
 
 
 class CChannelConnectError(Exception): pass
 class CChannelConnectError(Exception): pass
 
 
+class ProcessStartError(Exception): pass
+
 class BoB:
 class BoB:
     """Boss of BIND class."""
     """Boss of BIND class."""
     
     
     def __init__(self, msgq_socket_file=None, data_path=None,
     def __init__(self, msgq_socket_file=None, data_path=None,
     config_filename=None, nocache=False, verbose=False, setuid=None,
     config_filename=None, nocache=False, verbose=False, setuid=None,
-    username=None, cmdctl_port=None, brittle=False):
+    username=None, cmdctl_port=None, brittle=False, pwait_time=10):
         """
         """
             Initialize the Boss of BIND. This is a singleton (only one can run).
             Initialize the Boss of BIND. This is a singleton (only one can run).
         
         
@@ -249,6 +251,7 @@ class BoB:
         self.config_filename = config_filename
         self.config_filename = config_filename
         self.cmdctl_port = cmdctl_port
         self.cmdctl_port = cmdctl_port
         self.brittle = brittle
         self.brittle = brittle
+        self.pwait_time = pwait_time
         self.sockcreator = None
         self.sockcreator = None
 
 
     def config_handler(self, new_config):
     def config_handler(self, new_config):
@@ -435,6 +438,27 @@ class BoB:
         else:
         else:
             logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS_PID, self.curproc, pid)
             logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS_PID, self.curproc, pid)
 
 
+    def process_running(self, msg, who):
+        """
+            Some processes return a message to the Boss after they have
+            started to indicate that they are running.  The form of the
+            message is a dictionary with contents {"running": "<process>"}.
+            This method checks the passed message and returns True if the
+            "who" process is contained in the message (so is presumably
+            running).  It returns False for all other conditions and will
+            log an error if appropriate.
+        """
+        if msg is not None:
+            try:
+                if msg["running"] == who:
+                    return True
+                else:
+                    logger.error(BIND10_STARTUP_UNEXPECTED_MESSAGE, msg)
+            except:
+                logger.error(BIND10_STARTUP_UNRECOGNISED_MESSAGE, msg)
+        
+        return False
+
     # The next few methods start the individual processes of BIND-10.  They
     # The next few methods start the individual processes of BIND-10.  They
     # are called via start_all_processes().  If any fail, an exception is
     # are called via start_all_processes().  If any fail, an exception is
     # raised which is caught by the caller of start_all_processes(); this kills
     # raised which is caught by the caller of start_all_processes(); this kills
@@ -465,6 +489,10 @@ class BoB:
             except isc.cc.session.SessionError:
             except isc.cc.session.SessionError:
                 time.sleep(0.1)
                 time.sleep(0.1)
 
 
+        # Subscribe to the message queue.  The only messages we expect to receive
+        # on this channel are ones relating to process startup.
+        self.cc_session.group_subscribe("Boss")
+
     def start_cfgmgr(self, c_channel_env):
     def start_cfgmgr(self, c_channel_env):
         """
         """
             Starts the configuration manager process
             Starts the configuration manager process
@@ -482,11 +510,18 @@ class BoB:
         self.processes[bind_cfgd.pid] = bind_cfgd
         self.processes[bind_cfgd.pid] = bind_cfgd
         self.log_started(bind_cfgd.pid)
         self.log_started(bind_cfgd.pid)
 
 
-        # sleep until b10-cfgmgr is fully up and running, this is a good place
-        # to have a (short) timeout on synchronized groupsend/receive
-        # TODO: replace the sleep by a listen for ConfigManager started
-        # message
-        time.sleep(1)
+        # Wait for the configuration manager to start up.  The amount of time
+        # can be set on the command line.
+        time_remaining = self.pwait_time
+        msg, env = self.cc_session.group_recvmsg()
+        while time_remaining > 0 and not self.process_running(msg, "ConfigManager"):
+            logger.debug(DBG_PROCESS, BIND10_WAIT_CFGMGR)
+            time.sleep(1)
+            time_remaining = time_remaining - 1
+            msg, env = self.cc_session.group_recvmsg()
+        
+        if not self.process_running(msg, "ConfigManager"):
+            raise ProcessStartError("Configuration manager process has not started")
 
 
     def start_ccsession(self, c_channel_env):
     def start_ccsession(self, c_channel_env):
         """
         """
@@ -946,6 +981,8 @@ def parse_args(args=sys.argv[1:], Parser=OptionParser):
                       help="file to dump the PID of the BIND 10 process")
                       help="file to dump the PID of the BIND 10 process")
     parser.add_option("--brittle", dest="brittle", action="store_true",
     parser.add_option("--brittle", dest="brittle", action="store_true",
                       help="debugging flag: exit if any component dies")
                       help="debugging flag: exit if any component dies")
+    parser.add_option("-w", "--wait", dest="pwait_time", type="int",
+                      default=10, help="Time to wait for config manager to start up")
 
 
     (options, args) = parser.parse_args(args)
     (options, args) = parser.parse_args(args)
 
 
@@ -1048,7 +1085,8 @@ def main():
     # Go bob!
     # Go bob!
     boss_of_bind = BoB(options.msgq_socket_file, options.data_path,
     boss_of_bind = BoB(options.msgq_socket_file, options.data_path,
                        options.config_file, options.nocache, options.verbose,
                        options.config_file, options.nocache, options.verbose,
-                       setuid, username, options.cmdctl_port, options.brittle)
+                       setuid, username, options.cmdctl_port, options.brittle,
+                       options.pwait_time)
     startup_result = boss_of_bind.startup()
     startup_result = boss_of_bind.startup()
     if startup_result:
     if startup_result:
         logger.fatal(BIND10_STARTUP_ERROR, startup_result)
         logger.fatal(BIND10_STARTUP_ERROR, startup_result)

+ 1 - 1
src/lib/python/isc/config/cfgmgr.py

@@ -202,7 +202,7 @@ class ConfigManager:
 
 
     def notify_boss(self):
     def notify_boss(self):
         """Notifies the Boss module that the Config Manager is running"""
         """Notifies the Boss module that the Config Manager is running"""
-        self.cc.group_sendmsg({"running": "configmanager"}, "Boss")
+        self.cc.group_sendmsg({"running": "ConfigManager"}, "Boss")
 
 
     def set_module_spec(self, spec):
     def set_module_spec(self, spec):
         """Adds a ModuleSpec"""
         """Adds a ModuleSpec"""