Browse Source

[master] Merge branch 'trac1858'
commit.

JINMEI Tatuya 12 years ago
parent
commit
405d85c8a0

+ 24 - 7
src/bin/bind10/bind10_messages.mes

@@ -167,6 +167,30 @@ so BIND 10 will now shut down. The specific error is printed.
 % BIND10_SEND_SIGKILL sending SIGKILL to %1 (PID %2)
 % BIND10_SEND_SIGKILL sending SIGKILL to %1 (PID %2)
 The boss module is sending a SIGKILL signal to the given process.
 The boss module is sending a SIGKILL signal to the given process.
 
 
+% BIND10_SEND_SIGNAL_FAIL sending %1 to %2 (PID %3) failed: %4
+The boss module sent a single (either SIGTERM or SIGKILL) to a process,
+but it failed due to some system level error.  There are two major cases:
+the target process has already terminated but the boss module had sent
+the signal before it noticed the termination.  In this case an error
+message should indicate something like "no such process".  This can be
+safely ignored.  The other case is that the boss module doesn't have
+the privilege to send a signal to the process.  It can typically
+happen when the boss module started as a privileged process, spawned a
+subprocess, and then dropped the privilege.  It includes the case for
+the socket creator when the boss process runs with the -u command line
+option.  In this case, the boss module simply gives up to terminate
+the process explicitly because it's unlikely to succeed by keeping
+sending the signal.  Although the socket creator is implemented so
+that it will terminate automatically when the boss process exits
+(and that should be the case for any other future process running with
+a higher privilege), but it's recommended to check if there's any
+remaining BIND 10 process if this message is logged.  For all other
+cases, the boss module will keep sending the signal until it confirms
+all child processes terminate.  Although unlikely, this could prevent
+the boss module from exiting, just keeping sending the signals.  So,
+again, it's advisable to check if it really terminates when this
+message is logged.
+
 % BIND10_SEND_SIGTERM sending SIGTERM to %1 (PID %2)
 % BIND10_SEND_SIGTERM sending SIGTERM to %1 (PID %2)
 The boss module is sending a SIGTERM signal to the given process.
 The boss module is sending a SIGTERM signal to the given process.
 
 
@@ -274,13 +298,6 @@ During the startup process, a number of messages are exchanged between the
 Boss process and the processes it starts.  This error is output when a
 Boss process and the processes it starts.  This error is output when a
 message received by the Boss process is not recognised.
 message received by the Boss process is not recognised.
 
 
-% BIND10_START_AS_NON_ROOT_AUTH starting b10-auth as a user, not root. This might fail.
-The authoritative server is being started or restarted without root privileges.
-If the module needs these privileges, it may have problems starting.
-Note that this issue should be resolved by the pending 'socket-creator'
-process; once that has been implemented, modules should not need root
-privileges anymore. See tickets #800 and #801 for more information.
-
 % BIND10_START_AS_NON_ROOT_RESOLVER starting b10-resolver as a user, not root. This might fail.
 % BIND10_START_AS_NON_ROOT_RESOLVER starting b10-resolver as a user, not root. This might fail.
 The resolver is being started or restarted without root privileges.
 The resolver is being started or restarted without root privileges.
 If the module needs these privileges, it may have problems starting.
 If the module needs these privileges, it may have problems starting.

+ 30 - 22
src/bin/bind10/bind10_src.py.in

@@ -546,8 +546,6 @@ class BoB:
         """
         """
             Start the Authoritative server
             Start the Authoritative server
         """
         """
-        if self.uid is not None and self.__started:
-            logger.warn(BIND10_START_AS_NON_ROOT_AUTH)
         authargs = ['b10-auth']
         authargs = ['b10-auth']
         if self.verbose:
         if self.verbose:
             authargs += ['-v']
             authargs += ['-v']
@@ -693,32 +691,42 @@ class BoB:
         # from doing so
         # from doing so
         if not self.nokill:
         if not self.nokill:
             # next try sending a SIGTERM
             # next try sending a SIGTERM
-            components_to_stop = list(self.components.values())
-            for component in components_to_stop:
-                logger.info(BIND10_SEND_SIGTERM, component.name(), component.pid())
-                try:
-                    component.kill()
-                except OSError:
-                    # ignore these (usually ESRCH because the child
-                    # finally exited)
-                    pass
-            # finally, send SIGKILL (unmaskable termination) until everybody dies
+            self.__kill_children(False)
+            # finally, send SIGKILL (unmaskable termination) until everybody
+            # dies
             while self.components:
             while self.components:
                 # XXX: some delay probably useful... how much is uncertain
                 # XXX: some delay probably useful... how much is uncertain
                 time.sleep(0.1)
                 time.sleep(0.1)
                 self.reap_children()
                 self.reap_children()
-                components_to_stop = list(self.components.values())
-                for component in components_to_stop:
-                    logger.info(BIND10_SEND_SIGKILL, component.name(),
-                                component.pid())
-                    try:
-                        component.kill(True)
-                    except OSError:
-                        # ignore these (usually ESRCH because the child
-                        # finally exited)
-                        pass
+                self.__kill_children(True)
             logger.info(BIND10_SHUTDOWN_COMPLETE)
             logger.info(BIND10_SHUTDOWN_COMPLETE)
 
 
+    def __kill_children(self, forceful):
+        '''Terminate remaining subprocesses by sending a signal.
+
+        The forceful paramter will be passed Component.kill().
+        This is a dedicated subroutine of shutdown(), just to unify two
+        similar cases.
+
+        '''
+        logmsg = BIND10_SEND_SIGKILL if forceful else BIND10_SEND_SIGTERM
+        # We need to make a copy of values as the components may be modified
+        # in the loop.
+        for component in list(self.components.values()):
+            logger.info(logmsg, component.name(), component.pid())
+            try:
+                component.kill(forceful)
+            except OSError as ex:
+                # If kill() failed due to EPERM, it doesn't make sense to
+                # keep trying, so we just log the fact and forget that
+                # component.  Ignore other OSErrors (usually ESRCH because
+                # the child finally exited)
+                signame = "SIGKILL" if forceful else "SIGTERM"
+                logger.info(BIND10_SEND_SIGNAL_FAIL, signame,
+                            component.name(), component.pid(), ex)
+                if ex.errno == errno.EPERM:
+                    del self.components[component.pid()]
+
     def _get_process_exit_status(self):
     def _get_process_exit_status(self):
         return os.waitpid(-1, os.WNOHANG)
         return os.waitpid(-1, os.WNOHANG)
 
 

+ 34 - 2
src/bin/bind10/tests/bind10_test.py.in

@@ -1181,7 +1181,7 @@ class TestBossComponents(unittest.TestCase):
         # We check somewhere else that the shutdown is actually called
         # We check somewhere else that the shutdown is actually called
         # from there (the test_kills).
         # from there (the test_kills).
 
 
-    def __real_test_kill(self, nokill = False):
+    def __real_test_kill(self, nokill=False, ex_on_kill=None):
         """
         """
         Helper function that does the actual kill functionality testing.
         Helper function that does the actual kill functionality testing.
         """
         """
@@ -1195,8 +1195,23 @@ class TestBossComponents(unittest.TestCase):
             (anyway it is not told so). It does not die if it is killed
             (anyway it is not told so). It does not die if it is killed
             the first time. It dies only when killed forcefully.
             the first time. It dies only when killed forcefully.
             """
             """
+            def __init__(self):
+                # number of kill() calls, preventing infinite loop.
+                self.__call_count = 0
+
             def kill(self, forceful=False):
             def kill(self, forceful=False):
+                self.__call_count += 1
+                if self.__call_count > 2:
+                    raise Exception('Too many calls to ImmortalComponent.kill')
+
                 killed.append(forceful)
                 killed.append(forceful)
+                if ex_on_kill is not None:
+                    # If exception is given by the test, raise it here.
+                    # In the case of ESRCH, the process should have gone
+                    # somehow, so we clear the components.
+                    if ex_on_kill.errno == errno.ESRCH:
+                        bob.components = {}
+                    raise ex_on_kill
                 if forceful:
                 if forceful:
                     bob.components = {}
                     bob.components = {}
             def pid(self):
             def pid(self):
@@ -1224,7 +1239,10 @@ class TestBossComponents(unittest.TestCase):
         if nokill:
         if nokill:
             self.assertEqual([], killed)
             self.assertEqual([], killed)
         else:
         else:
-            self.assertEqual([False, True], killed)
+            if ex_on_kill is not None:
+                self.assertEqual([False], killed)
+            else:
+                self.assertEqual([False, True], killed)
 
 
         self.assertTrue(self.__called)
         self.assertTrue(self.__called)
 
 
@@ -1236,6 +1254,20 @@ class TestBossComponents(unittest.TestCase):
         """
         """
         self.__real_test_kill()
         self.__real_test_kill()
 
 
+    def test_kill_fail(self):
+        """Test cases where kill() results in an exception due to OS error.
+
+        The behavior should be different for EPERM, so we test two cases.
+
+        """
+
+        ex = OSError()
+        ex.errno, ex.strerror = errno.ESRCH, 'No such process'
+        self.__real_test_kill(ex_on_kill=ex)
+
+        ex.errno, ex.strerror = errno.EPERM, 'Operation not permitted'
+        self.__real_test_kill(ex_on_kill=ex)
+
     def test_nokill(self):
     def test_nokill(self):
         """
         """
         Test that the boss *doesn't* kill components which don't want to
         Test that the boss *doesn't* kill components which don't want to

+ 5 - 0
src/lib/python/isc/bind10/sockcreator.py

@@ -214,9 +214,14 @@ class Creator(Parser):
                                 socket.SOCK_STREAM)
                                 socket.SOCK_STREAM)
         env = copy.deepcopy(os.environ)
         env = copy.deepcopy(os.environ)
         env['PATH'] = path
         env['PATH'] = path
+        # We explicitly set close_fs to True; it's False by default before
+        # Python 3.2.  If we don't close the remaining FDs, the copy of
+        # 'local' will prevent the child process from terminating when
+        # the parent process died abruptly.
         self.__process = subprocess.Popen(['b10-sockcreator'], env=env,
         self.__process = subprocess.Popen(['b10-sockcreator'], env=env,
                                           stdin=remote.fileno(),
                                           stdin=remote.fileno(),
                                           stdout=remote2.fileno(),
                                           stdout=remote2.fileno(),
+                                          close_fds=True,
                                           preexec_fn=self.__preexec_work)
                                           preexec_fn=self.__preexec_work)
         remote.close()
         remote.close()
         remote2.close()
         remote2.close()