Browse Source

Add class to track process information.
Restart killed processes.



git-svn-id: svn://bind10.isc.org/svn/bind10/branches/f2f200910@228 e5f2f494-b856-4b98-b285-d166d9295462

Shane Kerr 15 years ago
parent
commit
4c54d5231a
1 changed files with 121 additions and 63 deletions
  1. 121 63
      src/bin/bind10/bind10.py

+ 121 - 63
src/bin/bind10/bind10.py

@@ -25,16 +25,40 @@ import sys
 import re
 import re
 import errno
 import errno
 import time
 import time
+import select
 from optparse import OptionParser, OptionValueError
 from optparse import OptionParser, OptionValueError
 
 
 import ISC.CC
 import ISC.CC
 
 
 # This is the version that gets displayed to the user.
 # This is the version that gets displayed to the user.
-__version__ = "v20091028 (Paving the DNS Parking Lot)"
+__version__ = "v20091030 (Paving the DNS Parking Lot)"
 
 
 # Nothing at all to do with the 1990-12-10 article here:
 # Nothing at all to do with the 1990-12-10 article here:
 # http://www.subgenius.com/subg-digest/v2/0056.html
 # http://www.subgenius.com/subg-digest/v2/0056.html
 
 
+class ProcessInfo:
+    """Information about a process"""
+
+    dev_null = open("/dev/null", "w")
+
+    def _spawn(self):
+        self.process = subprocess.Popen(self.args,
+                                        stdin=subprocess.PIPE,
+                                        stdout=self.dev_null,
+                                        stderr=self.dev_null,
+                                        close_fds=True,
+                                        env=self.env,)
+        self.pid = self.process.pid
+
+    def __init__(self, name, args, env={}):
+        self.name = name 
+        self.args = args
+        self.env = env
+        self._spawn()
+
+    def respawn(self):
+        self._spawn()
+
 class BoB:
 class BoB:
     """Boss of BIND class."""
     """Boss of BIND class."""
     def __init__(self, c_channel_port=9912, verbose=False):
     def __init__(self, c_channel_port=9912, verbose=False):
@@ -47,11 +71,10 @@ class BoB:
         """
         """
         self.verbose = True
         self.verbose = True
         self.c_channel_port = c_channel_port
         self.c_channel_port = c_channel_port
-        self.cc_process = None
         self.cc_session = None
         self.cc_session = None
         self.processes = {}
         self.processes = {}
         self.dead_processes = {}
         self.dead_processes = {}
-        self.component_processes = {}
+        self.runnable = False
 
 
     def startup(self):
     def startup(self):
         """Start the BoB instance.
         """Start the BoB instance.
@@ -59,23 +82,17 @@ class BoB:
         Returns None if successful, otherwise an string describing the
         Returns None if successful, otherwise an string describing the
         problem.
         problem.
         """
         """
-        dev_null = open("/dev/null", "w")
         # start the c-channel daemon
         # start the c-channel daemon
         if self.verbose:
         if self.verbose:
             sys.stdout.write("Starting msgq using port %d\n" % self.c_channel_port)
             sys.stdout.write("Starting msgq using port %d\n" % self.c_channel_port)
         c_channel_env = { "ISC_MSGQ_PORT": str(self.c_channel_port), }
         c_channel_env = { "ISC_MSGQ_PORT": str(self.c_channel_port), }
         try:
         try:
-            c_channel = subprocess.Popen("msgq",
-                                         stdin=subprocess.PIPE,
-                                         stdout=dev_null,
-                                         stderr=dev_null,
-                                         close_fds=True,
-                                         env=c_channel_env,)
+            c_channel = ProcessInfo("msgq", "msgq", c_channel_env)
         except:
         except:
             return "Unable to start msgq"
             return "Unable to start msgq"
         self.processes[c_channel.pid] = c_channel
         self.processes[c_channel.pid] = c_channel
         if self.verbose:
         if self.verbose:
-            sys.stdout.write("Started msgq with PID %d\n" % c_channel.pid)
+            sys.stdout.write("Started msgq (PID %d)\n" % c_channel.pid)
 
 
         # now connect to the c-channel
         # now connect to the c-channel
         cc_connect_start = time.time()
         cc_connect_start = time.time()
@@ -95,42 +112,30 @@ class BoB:
         if self.verbose:
         if self.verbose:
             sys.stdout.write("Starting bind-cfgd\n")
             sys.stdout.write("Starting bind-cfgd\n")
         try:
         try:
-            bind_cfgd = subprocess.Popen("bind-cfgd",
-                                         stdin=dev_null,
-                                         stdout=dev_null,
-                                         stderr=dev_null,
-                                         close_fds=True,
-                                         env={},)
+            bind_cfgd = ProcessInfo("bind-cfgd", "bind-cfgd")
         except:
         except:
-            c_channel.kill()
+            c_channel.process.kill()
             return "Unable to start bind-cfgd"
             return "Unable to start bind-cfgd"
         self.processes[bind_cfgd.pid] = bind_cfgd
         self.processes[bind_cfgd.pid] = bind_cfgd
         if self.verbose:
         if self.verbose:
-            sys.stdout.write("Started bind-cfgd with PID %d\n" % bind_cfgd.pid)
+            sys.stdout.write("Started bind-cfgd (PID %d)\n" % bind_cfgd.pid)
 
 
         # start the parking lot
         # start the parking lot
         # XXX: this must be read from the configuration manager in the future
         # XXX: this must be read from the configuration manager in the future
         # XXX: we hardcode port 5300
         # XXX: we hardcode port 5300
         if self.verbose:
         if self.verbose:
-            sys.stdout.write("Starting parkinglot\n")
+            sys.stdout.write("Starting parkinglot on port 5300\n")
         try:
         try:
-            parkinglot = subprocess.Popen(["parkinglot", "-p", "5300",],
-                                          stdin=dev_null,
-                                          stdout=dev_null,
-                                          stderr=dev_null,
-                                          close_fds=True,
-                                          env={},)
+            parkinglot = ProcessInfo("parkinglot", ["parkinglot", "-p", "5300"])
         except:
         except:
             c_channel.kill()
             c_channel.kill()
             bind_cfgd.kill()
             bind_cfgd.kill()
             return "Unable to start parkinglot"
             return "Unable to start parkinglot"
         self.processes[parkinglot.pid] = parkinglot
         self.processes[parkinglot.pid] = parkinglot
         if self.verbose:
         if self.verbose:
-            sys.stdout.write("Started parkinglot with PID %d\n" % parkinglot.pid)
+            sys.stdout.write("Started parkinglot (PID %d)\n" % parkinglot.pid)
 
 
-        # remember our super-important process
-        self.cc_process = c_channel
-        
+        self.runnable = True
         return None
         return None
 
 
     def stop_all_processes(self):
     def stop_all_processes(self):
@@ -147,36 +152,37 @@ class BoB:
         if self.verbose:
         if self.verbose:
             sys.stdout.write("Stopping the server.\n")
             sys.stdout.write("Stopping the server.\n")
         # first try using the BIND 10 request to stop
         # first try using the BIND 10 request to stop
-        if self.cc_session:
-            try:
-                self.stop_all_processes()
-            except:
-                pass
+        try:
+            self.stop_all_processes()
+        except:
+            pass
         time.sleep(0.1)  # XXX: some delay probably useful... how much is uncertain
         time.sleep(0.1)  # XXX: some delay probably useful... how much is uncertain
         # next try sending a SIGTERM
         # next try sending a SIGTERM
         processes_to_stop = list(self.processes.values())
         processes_to_stop = list(self.processes.values())
         unstopped_processes = []
         unstopped_processes = []
-        for process in processes_to_stop:
+        for proc_info in processes_to_stop:
             if self.verbose:
             if self.verbose:
-                sys.stdout.write("Sending SIGTERM to process %d.\n" % process.pid)
+                sys.stdout.write("Sending SIGTERM to %s (PID %d).\n" % 
+                                 (proc_info.name, proc_info.pid))
             try:
             try:
-                process.terminate()
+                proc_info.process.terminate()
             except OSError as o:
             except OSError as o:
                 # ignore these (usually ESRCH because the child
                 # ignore these (usually ESRCH because the child
                 # finally exited)
                 # finally exited)
                 pass
                 pass
         time.sleep(0.1)  # XXX: some delay probably useful... how much is uncertain
         time.sleep(0.1)  # XXX: some delay probably useful... how much is uncertain
-        for process in processes_to_stop:
-            (pid, exit_status) = os.waitpid(process.pid, os.WNOHANG)
+        for proc_info in processes_to_stop:
+            (pid, exit_status) = os.waitpid(proc_info.pid, os.WNOHANG)
             if pid == 0:
             if pid == 0:
-                unstopped_processes.append(process)
+                unstopped_processes.append(proc_info)
         # finally, send a SIGKILL (unmaskable termination)
         # finally, send a SIGKILL (unmaskable termination)
         processes_to_stop = unstopped_processes
         processes_to_stop = unstopped_processes
-        for process in processes_to_stop:
+        for proc_info in processes_to_stop:
             if self.verbose:
             if self.verbose:
-                sys.stdout.write("Sending SIGKILL to process %d.\n" % process.pid)
+                sys.stdout.write("Sending SIGKILL to %s (PID %d).\n" % 
+                                 (proc_info.name, proc_info.pid))
             try:
             try:
-                process.kill()
+                proc_info.process.kill()
             except OSError as o:
             except OSError as o:
                 # ignore these (usually ESRCH because the child
                 # ignore these (usually ESRCH because the child
                 # finally exited)
                 # finally exited)
@@ -192,16 +198,41 @@ class BoB:
         Returns True if everything is okay, or False if a fatal error
         Returns True if everything is okay, or False if a fatal error
         has been detected and the program should exit.
         has been detected and the program should exit.
         """
         """
-        process = self.processes.pop(pid)
-        self.dead_processes[process.pid] = process
+        proc_info = self.processes.pop(pid)
+        self.dead_processes[proc_info.pid] = proc_info
         if self.verbose:
         if self.verbose:
-            sys.stdout.write("Process %d died.\n" % pid)
-        if self.cc_process and (pid == self.cc_process.pid):
+            sys.stdout.write("Process %s (PID %d) died.\n" % 
+                             (proc_info.name, proc_info.pid))
+        if proc_info.name == "msgq":
             if self.verbose:
             if self.verbose:
                 sys.stdout.write("The msgq process died, shutting down.\n")
                 sys.stdout.write("The msgq process died, shutting down.\n")
-            return False
-        else:
-            return True
+            self.runnable = False
+
+    def recv_and_process_cc_msg(self):
+        """Receive and process the next message on the c-channel,
+        if any."""
+        routing, data = self.cc_session.group_recvmsg(False)
+        print("routing", routing)
+        print("data", data)
+
+    def restart_processes(self):
+        """Restart any dead processes."""
+        # XXX: this needs a back-off algorithm
+        still_dead = {}
+        for proc_info in self.dead_processes.values():
+            if self.verbose:
+                sys.stdout.write("Resurrecting dead %s process...\n" % 
+                                 proc_info.name)
+            try:
+                proc_info.respawn()
+                self.processes[proc_info.pid] = proc_info
+                if self.verbose:
+                    sys.stdout.write("Resurrected %s (PID %d)\n" %
+                                     (proc_info.name, proc_info.pid))
+            except:
+                still_dead[proc_info.pid] = proc_info
+        # remember any processes that refuse to be resurrected
+        self.dead_processes = still_dead
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
     def reaper(signal_number, stack_frame):
     def reaper(signal_number, stack_frame):
@@ -214,13 +245,9 @@ if __name__ == "__main__":
                 if o.errno == errno.ECHILD: break
                 if o.errno == errno.ECHILD: break
                 raise
                 raise
             if pid == 0: break
             if pid == 0: break
-            if not boss_of_bind.reap(pid, exit_status):
-                signal.signal(signal.SIGCHLD, signal.SIG_DFL)
-                boss_of_bind.shutdown()
-                sys.exit(0)
+            if boss_of_bind:
+                boss_of_bind.reap(pid, exit_status)
                    
                    
-                
-
     def get_signame(signal_number):
     def get_signame(signal_number):
         """Return the symbolic name for a signal."""
         """Return the symbolic name for a signal."""
         for sig in dir(signal):
         for sig in dir(signal):
@@ -232,14 +259,11 @@ if __name__ == "__main__":
     # XXX: perhaps register atexit() function and invoke that instead
     # XXX: perhaps register atexit() function and invoke that instead
     def fatal_signal(signal_number, stack_frame):
     def fatal_signal(signal_number, stack_frame):
         """We need to exit (SIGINT or SIGTERM received)."""
         """We need to exit (SIGINT or SIGTERM received)."""
-        global boss_of_bind
         global options
         global options
         if options.verbose:
         if options.verbose:
             sys.stdout.write("Received %s.\n" % get_signame(signal_number))
             sys.stdout.write("Received %s.\n" % get_signame(signal_number))
         signal.signal(signal.SIGCHLD, signal.SIG_DFL)
         signal.signal(signal.SIGCHLD, signal.SIG_DFL)
-        if boss_of_bind:
-            boss_of_bind.shutdown()
-        sys.exit(0)
+        boss_of_bind.runnable = False
 
 
     def check_port(option, opt_str, value, parser):
     def check_port(option, opt_str, value, parser):
         """Function to insure that the port we are passed is actually 
         """Function to insure that the port we are passed is actually 
@@ -265,6 +289,10 @@ if __name__ == "__main__":
     #       http://code.google.com/p/procname/
     #       http://code.google.com/p/procname/
     #       http://github.com/lericson/procname/
     #       http://github.com/lericson/procname/
 
 
+    # Create wakeup pipe for signal handlers
+    wakeup_pipe = os.pipe()
+    signal.set_wakeup_fd(wakeup_pipe[1])
+
     # Set signal handlers for catching child termination, as well
     # Set signal handlers for catching child termination, as well
     # as our own demise.
     # as our own demise.
     signal.signal(signal.SIGCHLD, reaper)
     signal.signal(signal.SIGCHLD, reaper)
@@ -279,6 +307,36 @@ if __name__ == "__main__":
         sys.stderr.write("Error on startup: %s\n" % startup_result)
         sys.stderr.write("Error on startup: %s\n" % startup_result)
         sys.exit(1)
         sys.exit(1)
 
 
-    while True:
-        time.sleep(1)
+    # In our main loop, we check for dead processes or messages 
+    # on the c-channel.
+    event_poller = select.poll()
+    wakeup_fd = wakeup_pipe[0]
+    event_poller.register(wakeup_fd, select.POLLIN)
+    cc_fd = boss_of_bind.cc_session._socket.fileno()
+    event_poller.register(cc_fd, select.POLLIN)
+    while boss_of_bind.runnable:
+        # XXX: get time for next restart for poll
+
+        # poll() can raise EINTR when a signal arrives, 
+        # even if they are resumable, so we have to catch
+        # the exception
+        try:
+            events = event_poller.poll()
+        except select.error as err:
+            if err.args[0] == errno.EINTR:
+                events = []
+            else:
+                sys.stderr.write("Error with poll(); %s\n" % err)
+                break
+
+        for (fd, event) in events:
+            if fd == cc_fd:
+                boss_of_bind.recv_and_process_cc_msg()
+            elif fd == wakeup_fd:
+                os.read(wakeup_fd, 32)
+
+        boss_of_bind.restart_processes()
 
 
+    # shutdown
+    signal.signal(signal.SIGCHLD, signal.SIG_DFL)
+    boss_of_bind.shutdown()