Browse Source

[1858] stop sending hopeless signal when it once fails with EPERM.

JINMEI Tatuya 12 years ago
parent
commit
9c655172a2

+ 24 - 0
src/bin/bind10/bind10_messages.mes

@@ -305,3 +305,27 @@ the configuration manager to start up.  The total length of time Boss
 will wait for the configuration manager before reporting an error is
 set with the command line --wait switch, which has a default value of
 ten seconds.
+
+% BIND10_SEND_SIGNAL_FAIL sending %1 to %2 (PID %3) failed: %4
+The boss module sent a single (either SIGTERM or SIGKILL) to a process,
+but it failed due to some system level error.  There are two major cases:
+the target process has already terminated but the boss module had sent
+the signal before it noticed the termination.  In this case an error
+message should indicate something like "no such process".  This can be
+safely ignored.  The other case is that the boss module doesn't have
+the privilege to send a signal to the process.  It can typically
+happen when the boss module started as a privileged process, spawned a
+subprocess, and then dropped the privilege.  It includes the case for
+the socket creator when the boss process runs with the -u command line
+option.  In this case, the boss module simply gives up to terminate
+the process explicitly because it's unlikely to succeed by keeping
+sending the signal.  Although the socket creator is implemented so
+that it will terminate automatically when the boss process exits
+(and that should be the case for any other future process running with
+a higher privilege), but it's recommended to check if there's any
+remaining BIND 10 process if this message is logged.  For all other
+cases, the boss module will keep sending the signal until it confirms
+all child processes terminate.  Although unlikely, this could prevent
+the boss module from exiting, just keeping sending the signals.  So,
+again, it's advisable to check if it really terminates when this
+message is logged.

+ 13 - 5
src/bin/bind10/bind10_src.py.in

@@ -712,14 +712,22 @@ class BoB:
 
         '''
         logmsg = BIND10_SEND_SIGKILL if forceful else BIND10_SEND_SIGTERM
-        for component in self.components.values():
+        # We need to make a copy of values as the components may be modified
+        # in the loop.
+        for component in list(self.components.values()):
             logger.info(logmsg, component.name(), component.pid())
             try:
                 component.kill(forceful)
-            except OSError:
-                # ignore these (usually ESRCH because the child
-                # finally exited)
-                pass
+            except OSError as ex:
+                # If kill() failed due to EPERM, it doesn't make sense to
+                # keep trying, so we just log the fact and forget that
+                # component.  Ignore other OSErrors (usually ESRCH because
+                # the child finally exited)
+                signame = "SIGKILL" if forceful else "SIGTERM"
+                logger.info(BIND10_SEND_SIGNAL_FAIL, signame,
+                            component.name(), component.pid(), ex)
+                if ex.errno == errno.EPERM:
+                    del self.components[component.pid()]
 
     def _get_process_exit_status(self):
         return os.waitpid(-1, os.WNOHANG)

+ 17 - 2
src/bin/bind10/tests/bind10_test.py.in

@@ -1195,7 +1195,15 @@ class TestBossComponents(unittest.TestCase):
             (anyway it is not told so). It does not die if it is killed
             the first time. It dies only when killed forcefully.
             """
+            def __init__(self):
+                # number of kill() calls, preventing infinite loop.
+                self.__call_count = 0
+
             def kill(self, forceful=False):
+                self.__call_count += 1
+                if self.__call_count > 2:
+                    raise Exception('Too many calls to ImmortalComponent.kill')
+
                 killed.append(forceful)
                 if ex_on_kill is not None:
                     # If exception is given by the test, raise it here.
@@ -1247,10 +1255,17 @@ class TestBossComponents(unittest.TestCase):
         self.__real_test_kill()
 
     def test_kill_fail(self):
-        """Test cases where kill() results in an exception due to OS error."""
+        """Test cases where kill() results in an exception due to OS error.
+
+        The behavior should be different for EPERM, so we test two cases.
+
+        """
 
         ex = OSError()
-        ex.errno = errno.ESRCH
+        ex.errno, ex.strerror = errno.ESRCH, 'No such process'
+        self.__real_test_kill(ex_on_kill=ex)
+
+        ex.errno, ex.strerror = errno.EPERM, 'Operation not permitted'
         self.__real_test_kill(ex_on_kill=ex)
 
     def test_nokill(self):