Browse Source

[master] Merge branch 'trac1858'
commit.

JINMEI Tatuya 12 years ago
parent
commit
405d85c8a0

+ 24 - 7
src/bin/bind10/bind10_messages.mes

@@ -167,6 +167,30 @@ so BIND 10 will now shut down. The specific error is printed.
 % BIND10_SEND_SIGKILL sending SIGKILL to %1 (PID %2)
 The boss module is sending a SIGKILL signal to the given process.
 
+% BIND10_SEND_SIGNAL_FAIL sending %1 to %2 (PID %3) failed: %4
+The boss module sent a single (either SIGTERM or SIGKILL) to a process,
+but it failed due to some system level error.  There are two major cases:
+the target process has already terminated but the boss module had sent
+the signal before it noticed the termination.  In this case an error
+message should indicate something like "no such process".  This can be
+safely ignored.  The other case is that the boss module doesn't have
+the privilege to send a signal to the process.  It can typically
+happen when the boss module started as a privileged process, spawned a
+subprocess, and then dropped the privilege.  It includes the case for
+the socket creator when the boss process runs with the -u command line
+option.  In this case, the boss module simply gives up to terminate
+the process explicitly because it's unlikely to succeed by keeping
+sending the signal.  Although the socket creator is implemented so
+that it will terminate automatically when the boss process exits
+(and that should be the case for any other future process running with
+a higher privilege), but it's recommended to check if there's any
+remaining BIND 10 process if this message is logged.  For all other
+cases, the boss module will keep sending the signal until it confirms
+all child processes terminate.  Although unlikely, this could prevent
+the boss module from exiting, just keeping sending the signals.  So,
+again, it's advisable to check if it really terminates when this
+message is logged.
+
 % BIND10_SEND_SIGTERM sending SIGTERM to %1 (PID %2)
 The boss module is sending a SIGTERM signal to the given process.
 
@@ -274,13 +298,6 @@ During the startup process, a number of messages are exchanged between the
 Boss process and the processes it starts.  This error is output when a
 message received by the Boss process is not recognised.
 
-% BIND10_START_AS_NON_ROOT_AUTH starting b10-auth as a user, not root. This might fail.
-The authoritative server is being started or restarted without root privileges.
-If the module needs these privileges, it may have problems starting.
-Note that this issue should be resolved by the pending 'socket-creator'
-process; once that has been implemented, modules should not need root
-privileges anymore. See tickets #800 and #801 for more information.
-
 % BIND10_START_AS_NON_ROOT_RESOLVER starting b10-resolver as a user, not root. This might fail.
 The resolver is being started or restarted without root privileges.
 If the module needs these privileges, it may have problems starting.

+ 30 - 22
src/bin/bind10/bind10_src.py.in

@@ -546,8 +546,6 @@ class BoB:
         """
             Start the Authoritative server
         """
-        if self.uid is not None and self.__started:
-            logger.warn(BIND10_START_AS_NON_ROOT_AUTH)
         authargs = ['b10-auth']
         if self.verbose:
             authargs += ['-v']
@@ -693,32 +691,42 @@ class BoB:
         # from doing so
         if not self.nokill:
             # next try sending a SIGTERM
-            components_to_stop = list(self.components.values())
-            for component in components_to_stop:
-                logger.info(BIND10_SEND_SIGTERM, component.name(), component.pid())
-                try:
-                    component.kill()
-                except OSError:
-                    # ignore these (usually ESRCH because the child
-                    # finally exited)
-                    pass
-            # finally, send SIGKILL (unmaskable termination) until everybody dies
+            self.__kill_children(False)
+            # finally, send SIGKILL (unmaskable termination) until everybody
+            # dies
             while self.components:
                 # XXX: some delay probably useful... how much is uncertain
                 time.sleep(0.1)
                 self.reap_children()
-                components_to_stop = list(self.components.values())
-                for component in components_to_stop:
-                    logger.info(BIND10_SEND_SIGKILL, component.name(),
-                                component.pid())
-                    try:
-                        component.kill(True)
-                    except OSError:
-                        # ignore these (usually ESRCH because the child
-                        # finally exited)
-                        pass
+                self.__kill_children(True)
             logger.info(BIND10_SHUTDOWN_COMPLETE)
 
+    def __kill_children(self, forceful):
+        '''Terminate remaining subprocesses by sending a signal.
+
+        The forceful paramter will be passed Component.kill().
+        This is a dedicated subroutine of shutdown(), just to unify two
+        similar cases.
+
+        '''
+        logmsg = BIND10_SEND_SIGKILL if forceful else BIND10_SEND_SIGTERM
+        # We need to make a copy of values as the components may be modified
+        # in the loop.
+        for component in list(self.components.values()):
+            logger.info(logmsg, component.name(), component.pid())
+            try:
+                component.kill(forceful)
+            except OSError as ex:
+                # If kill() failed due to EPERM, it doesn't make sense to
+                # keep trying, so we just log the fact and forget that
+                # component.  Ignore other OSErrors (usually ESRCH because
+                # the child finally exited)
+                signame = "SIGKILL" if forceful else "SIGTERM"
+                logger.info(BIND10_SEND_SIGNAL_FAIL, signame,
+                            component.name(), component.pid(), ex)
+                if ex.errno == errno.EPERM:
+                    del self.components[component.pid()]
+
     def _get_process_exit_status(self):
         return os.waitpid(-1, os.WNOHANG)
 

+ 34 - 2
src/bin/bind10/tests/bind10_test.py.in

@@ -1181,7 +1181,7 @@ class TestBossComponents(unittest.TestCase):
         # We check somewhere else that the shutdown is actually called
         # from there (the test_kills).
 
-    def __real_test_kill(self, nokill = False):
+    def __real_test_kill(self, nokill=False, ex_on_kill=None):
         """
         Helper function that does the actual kill functionality testing.
         """
@@ -1195,8 +1195,23 @@ class TestBossComponents(unittest.TestCase):
             (anyway it is not told so). It does not die if it is killed
             the first time. It dies only when killed forcefully.
             """
+            def __init__(self):
+                # number of kill() calls, preventing infinite loop.
+                self.__call_count = 0
+
             def kill(self, forceful=False):
+                self.__call_count += 1
+                if self.__call_count > 2:
+                    raise Exception('Too many calls to ImmortalComponent.kill')
+
                 killed.append(forceful)
+                if ex_on_kill is not None:
+                    # If exception is given by the test, raise it here.
+                    # In the case of ESRCH, the process should have gone
+                    # somehow, so we clear the components.
+                    if ex_on_kill.errno == errno.ESRCH:
+                        bob.components = {}
+                    raise ex_on_kill
                 if forceful:
                     bob.components = {}
             def pid(self):
@@ -1224,7 +1239,10 @@ class TestBossComponents(unittest.TestCase):
         if nokill:
             self.assertEqual([], killed)
         else:
-            self.assertEqual([False, True], killed)
+            if ex_on_kill is not None:
+                self.assertEqual([False], killed)
+            else:
+                self.assertEqual([False, True], killed)
 
         self.assertTrue(self.__called)
 
@@ -1236,6 +1254,20 @@ class TestBossComponents(unittest.TestCase):
         """
         self.__real_test_kill()
 
+    def test_kill_fail(self):
+        """Test cases where kill() results in an exception due to OS error.
+
+        The behavior should be different for EPERM, so we test two cases.
+
+        """
+
+        ex = OSError()
+        ex.errno, ex.strerror = errno.ESRCH, 'No such process'
+        self.__real_test_kill(ex_on_kill=ex)
+
+        ex.errno, ex.strerror = errno.EPERM, 'Operation not permitted'
+        self.__real_test_kill(ex_on_kill=ex)
+
     def test_nokill(self):
         """
         Test that the boss *doesn't* kill components which don't want to

+ 5 - 0
src/lib/python/isc/bind10/sockcreator.py

@@ -214,9 +214,14 @@ class Creator(Parser):
                                 socket.SOCK_STREAM)
         env = copy.deepcopy(os.environ)
         env['PATH'] = path
+        # We explicitly set close_fs to True; it's False by default before
+        # Python 3.2.  If we don't close the remaining FDs, the copy of
+        # 'local' will prevent the child process from terminating when
+        # the parent process died abruptly.
         self.__process = subprocess.Popen(['b10-sockcreator'], env=env,
                                           stdin=remote.fileno(),
                                           stdout=remote2.fileno(),
+                                          close_fds=True,
                                           preexec_fn=self.__preexec_work)
         remote.close()
         remote2.close()