Browse Source

[988] do not continue after recv_fd fails, which would cause infinite loop.

I did some other cleanups:
- log the failure regardless of the exact error code returned by recv_fd
  (in fact, when it fails it should always be FD_SYSTEM_ERROR in practice).
- remove the code comment about that.  It talked about rather unusual case
  and could be misleading.  And, now the detailed version of log message
  provides sufficient information.
- revise the detailed log message so it will explain more likely reason
  for that and clarify what the admin should do.
JINMEI Tatuya 13 years ago
parent
commit
9a24a3143e

+ 40 - 0
src/bin/xfrout/tests/xfrout_test.py.in

@@ -60,6 +60,9 @@ class MySocket():
         self.sendqueue.extend(data);
         return len(data)
 
+    def fileno(self):
+        return 42               # simply return a constant dummy value
+
     def readsent(self):
         if len(self.sendqueue) >= 2:
             size = 2 + struct.unpack("!H", self.sendqueue[:2])[0]
@@ -1155,6 +1158,15 @@ class TestUnixSockServer(unittest.TestCase):
     def setUp(self):
         self.write_sock, self.read_sock = socket.socketpair()
         self.unix = MyUnixSockServer()
+        # Some test below modify these module-wide attributes.  We'll need
+        # to restore them at the end of each test, so we remember them here.
+        self.__select_bak = xfrout.select.select
+        self.__recv_fd_back = xfrout.recv_fd
+
+    def tearDown(self):
+        # Restore possibly faked module-wide attributes.
+        xfrout.select.select = self.__select_bak
+        xfrout.recv_fd = self.__recv_fd_back
 
     def test_tsig_keyring(self):
         """
@@ -1414,6 +1426,34 @@ class TestUnixSockServer(unittest.TestCase):
         sys.stdout = old_stdout
         os.rmdir(dir_name)
 
+    def __fake_select(self, r, w, e):
+        '''select emulator used in select_loop_fail test.'''
+        # This simplified faked function assumes to be called at most once,
+        # and in that case just return a pre-configured "readable" sockets.
+        if self.__select_count > 0:
+            raise RuntimeError('select called unexpected number of times')
+        self.__select_count += 1
+        return (self.__select_return_redable, [], [])
+
+    def test_select_loop_fail(self):
+        '''Check failure events in the main loop.'''
+        # setup faked select() environments
+        self.unix._read_sock = MySocket(socket.AF_INET6, socket.SOCK_STREAM)
+        xfrout.select.select = self.__fake_select
+        self.__select_return_redable = [MySocket(socket.AF_INET6,
+                                                 socket.SOCK_STREAM)]
+
+        # Check that loop terminates if recv_fd() fails.
+        for ret_code in [-1, FD_SYSTEM_ERROR]:
+            # fake recv_fd so it returns the faked failure code.
+            xfrout.recv_fd = lambda fileno: ret_code
+
+            # reset the counter, go to the loop.
+            self.__select_count = 0
+            self.unix._select_loop(self.__select_return_redable[0])
+            # select should have been called exactly once.
+            self.assertEqual(1, self.__select_count)
+
 class TestInitialization(unittest.TestCase):
     def setEnv(self, name, value):
         if value is None:

+ 12 - 9
src/bin/xfrout/xfrout.py.in

@@ -708,7 +708,8 @@ class UnixSockServer(socketserver_mixin.NoPollMixIn,
                 continue
 
             try:
-                self.process_request(request_sock)
+                if not self.process_request(request_sock):
+                    break
             except Exception as pre:
                 logger.error(XFROUT_PROCESS_REQUEST_ERROR, pre)
                 break
@@ -722,26 +723,28 @@ class UnixSockServer(socketserver_mixin.NoPollMixIn,
 
     def process_request(self, request):
         """Receive socket fd and query message from auth, then
-        start a new thread to process the request."""
+        start a new thread to process the request.
+
+        Return: True if everything is okay; otherwise False, in which case
+        the calling thread will terminate.
+
+        """
         sock_fd = recv_fd(request.fileno())
         if sock_fd < 0:
-            # This may happen when one xfrout process try to connect to
-            # xfrout unix socket server, to check whether there is another
-            # xfrout running.
-            if sock_fd == FD_SYSTEM_ERROR:
-                logger.error(XFROUT_RECEIVE_FILE_DESCRIPTOR_ERROR)
-            return
+            logger.warn(XFROUT_RECEIVE_FILE_DESCRIPTOR_ERROR)
+            return False
 
         # receive request msg
         request_data = self._receive_query_message(request)
         if not request_data:
-            return
+            return True
 
         t = threading.Thread(target=self.finish_request,
                              args=(sock_fd, request_data))
         if self.daemon_threads:
             t.daemon = True
         t.start()
+        return True
 
     def _guess_remote(self, sock_fd):
         """Guess remote address and port of the socket.

+ 10 - 3
src/bin/xfrout/xfrout_messages.mes

@@ -149,9 +149,16 @@ and will now shut down.
 
 % XFROUT_RECEIVE_FILE_DESCRIPTOR_ERROR error receiving the file descriptor for an XFR connection
 There was an error receiving the file descriptor for the transfer
-request. Normally, the request is received by b10-auth, and passed on
-to the xfrout daemon, so it can answer directly. However, there was a
-problem receiving this file descriptor. The request will be ignored.
+request from b10-auth.  There can be several reasons for this, but
+the most likely cause is that b10-auth terminates for some reason
+(maybe it's a bug of b10-auth, maybe it's an intentional restart by
+the administrator), so depending on how this happens it may or may not
+be a serious error.  But in any case this is not expected to happen
+frequently, and it's advisable to figure out how this happened if
+this message is logged.  Even if this error happens xfrout will reset
+its internal state and will keep receiving further requests.  So
+if it's just a temporary restart of b10-auth the administrator does
+not have to do anything.
 
 % XFROUT_REMOVE_OLD_UNIX_SOCKET_FILE_ERROR error removing unix socket file %1: %2
 The unix socket file xfrout needs for contact with the auth daemon