Browse Source

[1342] reintroduce delayed restarts

Jelte Jansen 13 years ago
parent
commit
16b7feca03
2 changed files with 54 additions and 36 deletions
  1. 17 34
      src/bin/bind10/bind10_src.py.in
  2. 37 2
      src/lib/python/isc/bind10/component.py

+ 17 - 34
src/bin/bind10/bind10_src.py.in

@@ -257,6 +257,7 @@ class BoB:
         # inappropriate. But as the code probably isn't completely ready
         # for it, we leave it at components for now.
         self.components = {}
+        self.components_to_restart = []
         self.runnable = False
         self.uid = setuid
         self.username = username
@@ -817,7 +818,11 @@ class BoB:
                     # Tell it it failed. But only if it matters (we are
                     # not shutting down and the component considers itself
                     # to be running).
-                    component.failed(exit_status);
+                    component_restarted = component.failed(exit_status);
+                    # if the process wants to be restarted, but not just yet,
+                    # it returns False
+                    if not component_restarted:
+                        self.components_to_restart.append(component)
             else:
                 logger.info(BIND10_UNKNOWN_CHILD_PROCESS_ENDED, pid)
 
@@ -833,39 +838,17 @@ class BoB:
             timeout value.
 
         """
-        # TODO: This is an artefact of previous way of handling processes. The
-        # restart queue is currently empty at all times, so this returns None
-        # every time it is called (thought is a relict that is obviously wrong,
-        # it is called and it doesn't hurt).
-        #
-        # It is preserved for archeological reasons for the time when we return
-        # the delayed restarts, most of it might be useful then (or, if it is
-        # found useless, removed).
-        next_restart = None
-        # if we're shutting down, then don't restart
-        if not self.runnable:
-            return 0
-        # otherwise look through each dead process and try to restart
-        still_dead = {}
-        now = time.time()
-        for proc_info in self.dead_processes.values():
-            restart_time = proc_info.restart_schedule.get_restart_time(now)
-            if restart_time > now:
-                if (next_restart is None) or (next_restart > restart_time):
-                    next_restart = restart_time
-                still_dead[proc_info.pid] = proc_info
-            else:
-                logger.info(BIND10_RESURRECTING_PROCESS, proc_info.name)
-                try:
-                    proc_info.respawn()
-                    self.components[proc_info.pid] = proc_info
-                    logger.info(BIND10_RESURRECTED_PROCESS, proc_info.name, proc_info.pid)
-                except:
-                    still_dead[proc_info.pid] = proc_info
-        # remember any processes that refuse to be resurrected
-        self.dead_processes = still_dead
-        # return the time when the next process is ready to be restarted
-        return next_restart
+        still_dead = []
+        next_restart_time = None
+        for component in self.components_to_restart:
+            if not component.restart():
+                still_dead.append(component)
+                if next_restart_time is None or\
+                   next_restart_time > component.get_restart_time():
+                    next_restart_time = component.get_restart_time()
+        self.components_to_restart = still_dead
+
+        return next_restart_time
 
 # global variables, needed for signal handlers
 options = None

+ 37 - 2
src/lib/python/isc/bind10/component.py

@@ -83,7 +83,7 @@ class BaseComponent:
     that is already shutting down, impossible to stop, etc. We need to add more
     states in future to handle it properly.
     """
-    def __init__(self, boss, kind):
+    def __init__(self, boss, kind, restart_delay = 10):
         """
         Creates the component in not running mode.
 
@@ -103,6 +103,10 @@ class BaseComponent:
           * 'dispensable' means the component should be running, but if it
             doesn't start or crashes for some reason, the system simply tries
             to restart it and keeps running.
+        - `restart_delay`: when a component dies, and it has been running
+           for less time (in seconds) than this value, it is not immediately
+           restarted, but the Boss process waits until the delay has passed.
+           If it has been running for longer, it is immediately restarted.
 
         Note that the __init__ method of child class should have these
         parameters:
@@ -134,6 +138,7 @@ class BaseComponent:
         self.__state = STATE_STOPPED
         self._kind = kind
         self._boss = boss
+        self._restart_delay = restart_delay
 
     def start(self):
         """
@@ -188,6 +193,11 @@ class BaseComponent:
         The exit code is used for logging. It might be None.
 
         It calls _failed_internal internally.
+
+        Returns True if the process was immediately restarted, returns
+                False if the process was not restarted, either because
+                it is considered a core or needed component, or because
+                the component is to be restarted later.
         """
         logger.error(BIND10_COMPONENT_FAILED, self.name(), self.pid(),
                      exit_code if exit_code is not None else "unknown")
@@ -203,10 +213,35 @@ class BaseComponent:
             self.__state = STATE_DEAD
             logger.fatal(BIND10_COMPONENT_UNSATISFIED, self.name())
             self._boss.component_shutdown(1)
+            return False
         # This means we want to restart
         else:
-            logger.warn(BIND10_COMPONENT_RESTART, self.name())
+            # if we were only running for a short time, don't restart
+            # yet: record the time we want to restart at and report
+            # (via the return value of restart()) whether we restarted
+            self.set_restart_time()
+            return self.restart()
+
+    def set_restart_time(self):
+        """Calculates and sets the time this component should be restarted.
+           Currently, it uses a very basic algorithm; start time + restart
+           delay in seconds. This algorithm may be improved upon in the
+           future.
+        """
+        self._restart_at = self.__start_time + self._restart_delay
+
+    def get_restart_time(self):
+        """Returns the time at which this component should be restarted."""
+        return self._restart_at
+
+    def restart(self):
+        """Restarts the component if the restart time is smaller than 'now'
+           Returns True if the component is restarted, False if not"""
+        now = time.time()
+        if self.get_restart_time() < now:
             self.start()
+            return True
+        else:
+            return False
 
     def running(self):
         """